module/spl/spl-kmem.c
1 /*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
27 #include <sys/kmem.h>
28
29 #ifdef DEBUG_SUBSYSTEM
30 # undef DEBUG_SUBSYSTEM
31 #endif
32
33 #define DEBUG_SUBSYSTEM S_KMEM
34
35 /*
36 * Memory allocation interfaces and debugging for basic kmem_*
37 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
38 * all allocations will be tracked when they are allocated and
39 * freed. When the SPL module is unloaded a list of all leaked
40 * addresses and where they were allocated will be dumped to the
41 * console. Enabling this feature has a significant impact on
42 * performance but it makes finding memory leaks straightforward.
43 */
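
/*
 * A minimal usage sketch (not compiled, and not part of the original
 * file): consumers use the kmem_alloc()/kmem_free() macros from
 * <sys/kmem.h>, which are expected to expand to the tracked variants
 * below when DEBUG_KMEM is defined.  The struct and its field are
 * hypothetical.
 */
#if 0
struct foo {
	int	f_value;
};

static void
foo_kmem_example(void)
{
	struct foo *fp;

	/* Tracked allocation; recorded in kmem_table when
	 * DEBUG_KMEM_TRACKING is also enabled. */
	fp = kmem_alloc(sizeof(struct foo), KM_SLEEP);
	fp->f_value = 42;

	/* Matching free; omitting this would be reported as a leak by
	 * spl_kmem_fini() at module unload. */
	kmem_free(fp, sizeof(struct foo));
}
#endif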
44 #ifdef DEBUG_KMEM
45 /* Shim layer memory accounting */
46 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
47 unsigned long long kmem_alloc_max = 0;
48 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
49 unsigned long long vmem_alloc_max = 0;
50 int kmem_warning_flag = 1;
51
52 EXPORT_SYMBOL(kmem_alloc_used);
53 EXPORT_SYMBOL(kmem_alloc_max);
54 EXPORT_SYMBOL(vmem_alloc_used);
55 EXPORT_SYMBOL(vmem_alloc_max);
56 EXPORT_SYMBOL(kmem_warning_flag);
57
58 # ifdef DEBUG_KMEM_TRACKING
59
60 /* XXX - Not too surprisingly, with debugging enabled the xmem_locks are very
61 * highly contended particularly on xfree(). If we want to run with this
62 * detailed debugging enabled for anything other than debugging we need to
63 * minimize the contention by moving to a lock per xmem_table entry model.
64 */
65
66 # define KMEM_HASH_BITS 10
67 # define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
68
69 # define VMEM_HASH_BITS 10
70 # define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
71
72 typedef struct kmem_debug {
73 struct hlist_node kd_hlist; /* Hash node linkage */
74 struct list_head kd_list; /* List of all allocations */
75 void *kd_addr; /* Allocation pointer */
76 size_t kd_size; /* Allocation size */
77 const char *kd_func; /* Allocation function */
78 int kd_line; /* Allocation line */
79 } kmem_debug_t;
80
81 spinlock_t kmem_lock;
82 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
83 struct list_head kmem_list;
84
85 spinlock_t vmem_lock;
86 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
87 struct list_head vmem_list;
88
89 EXPORT_SYMBOL(kmem_lock);
90 EXPORT_SYMBOL(kmem_table);
91 EXPORT_SYMBOL(kmem_list);
92
93 EXPORT_SYMBOL(vmem_lock);
94 EXPORT_SYMBOL(vmem_table);
95 EXPORT_SYMBOL(vmem_list);
96 # endif
97
98 int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
99 #else
100 int kmem_set_warning(int flag) { return 0; }
101 #endif
102 EXPORT_SYMBOL(kmem_set_warning);
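
/*
 * Usage sketch (not compiled): a caller that knowingly performs
 * multi-page kmem_alloc()s can silence the "Large kmem_alloc" warnings
 * emitted by the allocation paths below.
 */
#if 0
	kmem_set_warning(0);	/* disable large-allocation warnings */
	/* ... multi-page kmem_alloc()/kmem_free() activity ... */
	kmem_set_warning(1);	/* restore the default behaviour */
#endif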
103
104 /*
105 * Slab allocation interfaces
106 *
107 * While the Linux slab implementation was inspired by the Solaris
108 * implementation, I cannot use it to emulate the Solaris APIs. I
109 * require two features which are not provided by the Linux slab.
110 *
111 * 1) Constructors AND destructors. Recent versions of the Linux
112 * kernel have removed support for destructors. This is a deal
113 * breaker for the SPL which contains particularly expensive
114 * initializers for mutexes, condition variables, etc. We also
115 * require a minimal level of cleanup for these data types unlike
116 * many Linux data types which do not need to be explicitly destroyed.
117 *
118 * 2) Virtual address space backed slab. Callers of the Solaris slab
119 * expect it to work well for both small and very large allocations.
120 * Because of memory fragmentation the Linux slab which is backed
121 * by kmalloc'ed memory performs very badly when confronted with
122 * large numbers of large allocations. Basing the slab on the
123 * virtual address space removes the need for contiguous pages
124 * and greatly improves performance for large allocations.
125 *
126 * For these reasons, the SPL has its own slab implementation with
127 * the needed features. It is not as highly optimized as either the
128 * Solaris or Linux slabs, but it should get me most of what is
129 * needed until it can be optimized or obsoleted by another approach.
130 *
131 * One serious concern I do have about this method is the relatively
132 * small virtual address space on 32bit arches. This will seriously
133 * constrain the size of the slab caches and their performance.
134 *
135 * XXX: Implement work requests to keep an eye on each cache and
136 * shrink them via spl_slab_reclaim() when they are wasting lots
137 * of space. Currently this process is driven by the reapers.
138 *
139 * XXX: Improve the partial slab list by carefully maintaining a
140 * strict ordering of fullest to emptiest slabs based on
141 * the slab reference count. This guarantees that when freeing
142 * slabs back to the system we need only linearly traverse the
143 * last N slabs in the list to discover all the freeable slabs.
144 *
145 * XXX: NUMA awareness for optionally allocating memory close to a
146 * particular core. This can be advantageous if you know the slab
147 * object will be short lived and primarily accessed from one core.
148 *
149 * XXX: Slab coloring may also yield performance improvements and would
150 * be desirable to implement.
151 *
152 * XXX: Proper hardware cache alignment would be good too.
153 */
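
/*
 * A minimal end-to-end usage sketch (not compiled, not part of this
 * file) for the cache API implemented below.  The object type and the
 * cache name are hypothetical; the constructor, destructor, and reclaim
 * callbacks are left NULL to keep the sketch short.
 */
#if 0
typedef struct foo {
	int	foo_value;
} foo_t;

static void
foo_cache_example(void)
{
	spl_kmem_cache_t *cache;
	foo_t *fp;

	/* With neither KMC_KMEM nor KMC_VMEM passed the cache picks a
	 * backing allocator below based on sizeof(foo_t). */
	cache = spl_kmem_cache_create("foo_cache", sizeof(foo_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);

	fp = spl_kmem_cache_alloc(cache, KM_SLEEP);
	fp->foo_value = 42;
	spl_kmem_cache_free(cache, fp);

	spl_kmem_cache_destroy(cache);
}
#endif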
154
155 struct list_head spl_kmem_cache_list; /* List of caches */
156 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
157
158 static int spl_cache_flush(spl_kmem_cache_t *skc,
159 spl_kmem_magazine_t *skm, int flush);
160
161 #ifdef HAVE_SET_SHRINKER
162 static struct shrinker *spl_kmem_cache_shrinker;
163 #else
164 static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
165 unsigned int gfp_mask);
166 static struct shrinker spl_kmem_cache_shrinker = {
167 .shrink = spl_kmem_cache_generic_shrinker,
168 .seeks = KMC_DEFAULT_SEEKS,
169 };
170 #endif
171
172 #ifdef DEBUG_KMEM
173 # ifdef DEBUG_KMEM_TRACKING
174
175 static kmem_debug_t *
176 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
177 void *addr)
178 {
179 struct hlist_head *head;
180 struct hlist_node *node;
181 struct kmem_debug *p;
182 unsigned long flags;
183 ENTRY;
184
185 spin_lock_irqsave(lock, flags);
186
187 head = &table[hash_ptr(addr, bits)];
188 hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
189 if (p->kd_addr == addr) {
190 hlist_del_init(&p->kd_hlist);
191 list_del_init(&p->kd_list);
192 spin_unlock_irqrestore(lock, flags);
193 return p;
194 }
195 }
196
197 spin_unlock_irqrestore(lock, flags);
198
199 RETURN(NULL);
200 }
201
202 void *
203 kmem_alloc_track(size_t size, int flags, const char *func, int line,
204 int node_alloc, int node)
205 {
206 void *ptr = NULL;
207 kmem_debug_t *dptr;
208 unsigned long irq_flags;
209 ENTRY;
210
211 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
212 flags & ~__GFP_ZERO);
213
214 if (dptr == NULL) {
215 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
216 sizeof(kmem_debug_t), flags);
217 } else {
218 /* Marked unlikely because we should never be doing this,
219 * we tolerate up to 2 pages but a single page is best. */
220 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
221 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
222 (unsigned long long) size, flags,
223 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
224
225 /* We use kstrdup() below because the string pointed to by
226 * __FUNCTION__ might not be available by the time we want
227 * to print it since the module might have been unloaded. */
228 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
229 if (unlikely(dptr->kd_func == NULL)) {
230 kfree(dptr);
231 CWARN("kstrdup() failed in kmem_alloc(%llu, 0x%x) "
232 "(%lld/%llu)\n", (unsigned long long) size, flags,
233 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
234 goto out;
235 }
236
237 /* Use the correct allocator */
238 if (node_alloc) {
239 ASSERT(!(flags & __GFP_ZERO));
240 ptr = kmalloc_node(size, flags, node);
241 } else if (flags & __GFP_ZERO) {
242 ptr = kzalloc(size, flags & ~__GFP_ZERO);
243 } else {
244 ptr = kmalloc(size, flags);
245 }
246
247 if (unlikely(ptr == NULL)) {
248 kfree(dptr->kd_func);
249 kfree(dptr);
250 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
251 (unsigned long long) size, flags,
252 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
253 goto out;
254 }
255
256 atomic64_add(size, &kmem_alloc_used);
257 if (unlikely(atomic64_read(&kmem_alloc_used) >
258 kmem_alloc_max))
259 kmem_alloc_max =
260 atomic64_read(&kmem_alloc_used);
261
262 INIT_HLIST_NODE(&dptr->kd_hlist);
263 INIT_LIST_HEAD(&dptr->kd_list);
264
265 dptr->kd_addr = ptr;
266 dptr->kd_size = size;
267 dptr->kd_line = line;
268
269 spin_lock_irqsave(&kmem_lock, irq_flags);
270 hlist_add_head_rcu(&dptr->kd_hlist,
271 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
272 list_add_tail(&dptr->kd_list, &kmem_list);
273 spin_unlock_irqrestore(&kmem_lock, irq_flags);
274
275 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
276 "(%lld/%llu)\n", (unsigned long long) size, flags,
277 ptr, atomic64_read(&kmem_alloc_used),
278 kmem_alloc_max);
279 }
280 out:
281 RETURN(ptr);
282 }
283 EXPORT_SYMBOL(kmem_alloc_track);
284
285 void
286 kmem_free_track(void *ptr, size_t size)
287 {
288 kmem_debug_t *dptr;
289 ENTRY;
290
291 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
292 (unsigned long long) size);
293
294 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
295
296 ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
297
298 /* Size must match */
299 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
300 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
301 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
302
303 atomic64_sub(size, &kmem_alloc_used);
304
305 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
306 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
307 kmem_alloc_max);
308
309 kfree(dptr->kd_func);
310
311 memset(dptr, 0x5a, sizeof(kmem_debug_t));
312 kfree(dptr);
313
314 memset(ptr, 0x5a, size);
315 kfree(ptr);
316
317 EXIT;
318 }
319 EXPORT_SYMBOL(kmem_free_track);
320
321 void *
322 vmem_alloc_track(size_t size, int flags, const char *func, int line)
323 {
324 void *ptr = NULL;
325 kmem_debug_t *dptr;
326 unsigned long irq_flags;
327 ENTRY;
328
329 ASSERT(flags & KM_SLEEP);
330
331 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
332 if (dptr == NULL) {
333 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
334 sizeof(kmem_debug_t), flags);
335 } else {
336 /* We use kstrdup() below because the string pointed to by
337 * __FUNCTION__ might not be available by the time we want
338 * to print it, since the module might have been unloaded. */
339 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
340 if (unlikely(dptr->kd_func == NULL)) {
341 kfree(dptr);
342 CWARN("kstrdup() failed in vmem_alloc(%llu, 0x%x) "
343 "(%lld/%llu)\n", (unsigned long long) size, flags,
344 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
345 goto out;
346 }
347
348 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
349 PAGE_KERNEL);
350
351 if (unlikely(ptr == NULL)) {
352 kfree(dptr->kd_func);
353 kfree(dptr);
354 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
355 (unsigned long long) size, flags,
356 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
357 goto out;
358 }
359
360 if (flags & __GFP_ZERO)
361 memset(ptr, 0, size);
362
363 atomic64_add(size, &vmem_alloc_used);
364 if (unlikely(atomic64_read(&vmem_alloc_used) >
365 vmem_alloc_max))
366 vmem_alloc_max =
367 atomic64_read(&vmem_alloc_used);
368
369 INIT_HLIST_NODE(&dptr->kd_hlist);
370 INIT_LIST_HEAD(&dptr->kd_list);
371
372 dptr->kd_addr = ptr;
373 dptr->kd_size = size;
374 dptr->kd_line = line;
375
376 spin_lock_irqsave(&vmem_lock, irq_flags);
377 hlist_add_head_rcu(&dptr->kd_hlist,
378 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
379 list_add_tail(&dptr->kd_list, &vmem_list);
380 spin_unlock_irqrestore(&vmem_lock, irq_flags);
381
382 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
383 "(%lld/%llu)\n", (unsigned long long) size, flags,
384 ptr, atomic64_read(&vmem_alloc_used),
385 vmem_alloc_max);
386 }
387 out:
388 RETURN(ptr);
389 }
390 EXPORT_SYMBOL(vmem_alloc_track);
391
392 void
393 vmem_free_track(void *ptr, size_t size)
394 {
395 kmem_debug_t *dptr;
396 ENTRY;
397
398 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
399 (unsigned long long) size);
400
401 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
402 ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
403
404 /* Size must match */
405 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
406 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
407 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
408
409 atomic64_sub(size, &vmem_alloc_used);
410 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
411 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
412 vmem_alloc_max);
413
414 kfree(dptr->kd_func);
415
416 memset(dptr, 0x5a, sizeof(kmem_debug_t));
417 kfree(dptr);
418
419 memset(ptr, 0x5a, size);
420 vfree(ptr);
421
422 EXIT;
423 }
424 EXPORT_SYMBOL(vmem_free_track);
425
426 # else /* DEBUG_KMEM_TRACKING */
427
428 void *
429 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
430 int node_alloc, int node)
431 {
432 void *ptr;
433 ENTRY;
434
435 /* Marked unlikely because we should never be doing this,
436 * we tolerate up to 2 pages but a single page is best. */
437 if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
438 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
439 (unsigned long long) size, flags,
440 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
441
442 /* Use the correct allocator */
443 if (node_alloc) {
444 ASSERT(!(flags & __GFP_ZERO));
445 ptr = kmalloc_node(size, flags, node);
446 } else if (flags & __GFP_ZERO) {
447 ptr = kzalloc(size, flags & (~__GFP_ZERO));
448 } else {
449 ptr = kmalloc(size, flags);
450 }
451
452 if (ptr == NULL) {
453 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
454 (unsigned long long) size, flags,
455 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
456 } else {
457 atomic64_add(size, &kmem_alloc_used);
458 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
459 kmem_alloc_max = atomic64_read(&kmem_alloc_used);
460
461 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
462 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
463 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
464 }
465 RETURN(ptr);
466 }
467 EXPORT_SYMBOL(kmem_alloc_debug);
468
469 void
470 kmem_free_debug(void *ptr, size_t size)
471 {
472 ENTRY;
473
474 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
475 (unsigned long long) size);
476
477 atomic64_sub(size, &kmem_alloc_used);
478
479 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
480 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
481 kmem_alloc_max);
482
483 memset(ptr, 0x5a, size);
484 kfree(ptr);
485
486 EXIT;
487 }
488 EXPORT_SYMBOL(kmem_free_debug);
489
490 void *
491 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
492 {
493 void *ptr;
494 ENTRY;
495
496 ASSERT(flags & KM_SLEEP);
497
498 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
499 PAGE_KERNEL);
500 if (ptr == NULL) {
501 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
502 (unsigned long long) size, flags,
503 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
504 } else {
505 if (flags & __GFP_ZERO)
506 memset(ptr, 0, size);
507
508 atomic64_add(size, &vmem_alloc_used);
509
510 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
511 vmem_alloc_max = atomic64_read(&vmem_alloc_used);
512
513 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
514 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
515 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
516 }
517
518 RETURN(ptr);
519 }
520 EXPORT_SYMBOL(vmem_alloc_debug);
521
522 void
523 vmem_free_debug(void *ptr, size_t size)
524 {
525 ENTRY;
526
527 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
528 (unsigned long long) size);
529
530 atomic64_sub(size, &vmem_alloc_used);
531
532 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
533 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
534 vmem_alloc_max);
535
536 memset(ptr, 0x5a, size);
537 vfree(ptr);
538
539 EXIT;
540 }
541 EXPORT_SYMBOL(vmem_free_debug);
542
543 # endif /* DEBUG_KMEM_TRACKING */
544 #endif /* DEBUG_KMEM */
545
546 static void *
547 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
548 {
549 void *ptr;
550
551 if (skc->skc_flags & KMC_KMEM) {
552 if (size > (2 * PAGE_SIZE)) {
553 ptr = (void *)__get_free_pages(flags, get_order(size));
554 } else
555 ptr = kmem_alloc(size, flags);
556 } else {
557 ptr = vmem_alloc(size, flags);
558 }
559
560 return ptr;
561 }
562
563 static void
564 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
565 {
566 if (skc->skc_flags & KMC_KMEM) {
567 if (size > (2 * PAGE_SIZE))
568 free_pages((unsigned long)ptr, get_order(size));
569 else
570 kmem_free(ptr, size);
571 } else {
572 vmem_free(ptr, size);
573 }
574 }
575
576 static spl_kmem_slab_t *
577 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
578 {
579 spl_kmem_slab_t *sks;
580 spl_kmem_obj_t *sko, *n;
581 void *base, *obj;
582 int i, size, rc = 0;
583
584 /* It's important that we pack the spl_kmem_obj_t structure
585 * and the actual objects into one large address space
586 * to minimize the number of calls to the allocator. It
587 * is far better to do a few large allocations and then
588 * subdivide it ourselves. Now which allocator we use
589 * requires balancing a few trade-offs.
590 *
591 * For small objects we use kmem_alloc() because as long
592 * as you are only requesting a small number of pages
593 * (ideally just one) it's cheap. However, when you start
594 * requesting multiple pages kmem_alloc() gets increasingly
595 * expensive since it requires contiguous pages. For this
596 * reason we shift to vmem_alloc() for slabs of large
597 * objects which removes the need for contiguous pages.
598 * We do not use vmem_alloc() in all cases because there
599 * is significant locking overhead in __get_vm_area_node().
600 * This function takes a single global lock when acquiring
601 * an available virtual address range which serializes all
602 * vmem_alloc()'s for all slab caches. Using slightly
603 * different allocation functions for small and large
604 * objects should give us the best of both worlds.
605 *
606 * sks struct: sizeof(spl_kmem_slab_t)
607 * obj data: skc->skc_obj_size
608 * obj struct: sizeof(spl_kmem_obj_t)
609 * <N obj data + obj structs>
610 *
611 * XXX: It would probably be a good idea to more carefully
612 * align these data structures in memory.
613 */
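	/* The resulting on-slab layout (KMC_OFFSLAB caches instead
	 * kv_alloc() each object individually) is:
	 *
	 *   base: [ spl_kmem_slab_t ]
	 *         [ obj 0 data (skc_obj_size) ][ spl_kmem_obj_t 0 ]
	 *         [ obj 1 data (skc_obj_size) ][ spl_kmem_obj_t 1 ]
	 *         ...
	 *
	 * so object i lives at base + sizeof(spl_kmem_slab_t) + i * size
	 * and its spl_kmem_obj_t at obj + skc_obj_size, which is how
	 * spl_cache_shrink() locates it when the object is freed. */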
614 base = kv_alloc(skc, skc->skc_slab_size, flags);
615 if (base == NULL)
616 RETURN(NULL);
617
618 sks = (spl_kmem_slab_t *)base;
619 sks->sks_magic = SKS_MAGIC;
620 sks->sks_objs = skc->skc_slab_objs;
621 sks->sks_age = jiffies;
622 sks->sks_cache = skc;
623 INIT_LIST_HEAD(&sks->sks_list);
624 INIT_LIST_HEAD(&sks->sks_free_list);
625 sks->sks_ref = 0;
626 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
627
628 for (i = 0; i < sks->sks_objs; i++) {
629 if (skc->skc_flags & KMC_OFFSLAB) {
630 obj = kv_alloc(skc, size, flags);
631 if (!obj)
632 GOTO(out, rc = -ENOMEM);
633 } else {
634 obj = base + sizeof(spl_kmem_slab_t) + i * size;
635 }
636
637 sko = obj + skc->skc_obj_size;
638 sko->sko_addr = obj;
639 sko->sko_magic = SKO_MAGIC;
640 sko->sko_slab = sks;
641 INIT_LIST_HEAD(&sko->sko_list);
642 list_add_tail(&sko->sko_list, &sks->sks_free_list);
643 }
644
645 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
646 if (skc->skc_ctor)
647 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
648 out:
649 if (rc) {
650 if (skc->skc_flags & KMC_OFFSLAB)
651 list_for_each_entry_safe(sko,n,&sks->sks_free_list,sko_list)
652 kv_free(skc, sko->sko_addr, size);
653
654 kv_free(skc, base, skc->skc_slab_size);
655 sks = NULL;
656 }
657
658 RETURN(sks);
659 }
660
661 /* Removes slab from complete or partial list, so it must
662 * be called with the 'skc->skc_lock' held.
663 */
664 static void
665 spl_slab_free(spl_kmem_slab_t *sks) {
666 spl_kmem_cache_t *skc;
667 spl_kmem_obj_t *sko, *n;
668 int size;
669 ENTRY;
670
671 ASSERT(sks->sks_magic == SKS_MAGIC);
672 ASSERT(sks->sks_ref == 0);
673
674 skc = sks->sks_cache;
675 ASSERT(skc->skc_magic == SKC_MAGIC);
676 ASSERT(spin_is_locked(&skc->skc_lock));
677
678 skc->skc_obj_total -= sks->sks_objs;
679 skc->skc_slab_total--;
680 list_del(&sks->sks_list);
681 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
682
683 /* Run destructors as the slab is being released */
684 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
685 ASSERT(sko->sko_magic == SKO_MAGIC);
686
687 if (skc->skc_dtor)
688 skc->skc_dtor(sko->sko_addr, skc->skc_private);
689
690 if (skc->skc_flags & KMC_OFFSLAB)
691 kv_free(skc, sko->sko_addr, size);
692 }
693
694 kv_free(skc, sks, skc->skc_slab_size);
695 EXIT;
696 }
697
698 static int
699 __spl_slab_reclaim(spl_kmem_cache_t *skc)
700 {
701 spl_kmem_slab_t *sks, *m;
702 int rc = 0;
703 ENTRY;
704
705 ASSERT(spin_is_locked(&skc->skc_lock));
706 /*
707 * Free empty slabs which have not been touched in skc_delay
708 * seconds. This delay time is important to avoid thrashing.
709 * Empty slabs will be at the end of the skc_partial_list.
710 */
711 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
712 sks_list) {
713 if (sks->sks_ref > 0)
714 break;
715
716 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
717 spl_slab_free(sks);
718 rc++;
719 }
720 }
721
722 /* Returns number of slabs reclaimed */
723 RETURN(rc);
724 }
725
726 static int
727 spl_slab_reclaim(spl_kmem_cache_t *skc)
728 {
729 int rc;
730 ENTRY;
731
732 spin_lock(&skc->skc_lock);
733 rc = __spl_slab_reclaim(skc);
734 spin_unlock(&skc->skc_lock);
735
736 RETURN(rc);
737 }
738
739 static int
740 spl_magazine_size(spl_kmem_cache_t *skc)
741 {
742 int size;
743 ENTRY;
744
745 /* Guesses for reasonable magazine sizes, they
746 * should really adapt based on observed usage. */
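	/* For example, assuming 4 KiB pages: a 64-byte object gets a
	 * 512-entry per-CPU magazine, an object just over one page gets
	 * 64 entries, and an object larger than 1 MiB gets only 4. */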
747 if (skc->skc_obj_size > (PAGE_SIZE * 256))
748 size = 4;
749 else if (skc->skc_obj_size > (PAGE_SIZE * 32))
750 size = 16;
751 else if (skc->skc_obj_size > (PAGE_SIZE))
752 size = 64;
753 else if (skc->skc_obj_size > (PAGE_SIZE / 4))
754 size = 128;
755 else
756 size = 512;
757
758 RETURN(size);
759 }
760
761 static spl_kmem_magazine_t *
762 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
763 {
764 spl_kmem_magazine_t *skm;
765 int size = sizeof(spl_kmem_magazine_t) +
766 sizeof(void *) * skc->skc_mag_size;
767 ENTRY;
768
769 skm = kmem_alloc_node(size, GFP_KERNEL, node);
770 if (skm) {
771 skm->skm_magic = SKM_MAGIC;
772 skm->skm_avail = 0;
773 skm->skm_size = skc->skc_mag_size;
774 skm->skm_refill = skc->skc_mag_refill;
775 if (!(skc->skc_flags & KMC_NOTOUCH))
776 skm->skm_age = jiffies;
777 }
778
779 RETURN(skm);
780 }
781
782 static void
783 spl_magazine_free(spl_kmem_magazine_t *skm)
784 {
785 int size = sizeof(spl_kmem_magazine_t) +
786 sizeof(void *) * skm->skm_size;
787
788 ENTRY;
789 ASSERT(skm->skm_magic == SKM_MAGIC);
790 ASSERT(skm->skm_avail == 0);
791
792 kmem_free(skm, size);
793 EXIT;
794 }
795
796 static int
797 spl_magazine_create(spl_kmem_cache_t *skc)
798 {
799 int i;
800 ENTRY;
801
802 skc->skc_mag_size = spl_magazine_size(skc);
803 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
804
805 for_each_online_cpu(i) {
806 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
807 if (!skc->skc_mag[i]) {
808 for (i--; i >= 0; i--)
809 spl_magazine_free(skc->skc_mag[i]);
810
811 RETURN(-ENOMEM);
812 }
813 }
814
815 RETURN(0);
816 }
817
818 static void
819 spl_magazine_destroy(spl_kmem_cache_t *skc)
820 {
821 spl_kmem_magazine_t *skm;
822 int i;
823 ENTRY;
824
825 for_each_online_cpu(i) {
826 skm = skc->skc_mag[i];
827 (void)spl_cache_flush(skc, skm, skm->skm_avail);
828 spl_magazine_free(skm);
829 }
830
831 EXIT;
832 }
833
834 spl_kmem_cache_t *
835 spl_kmem_cache_create(char *name, size_t size, size_t align,
836 spl_kmem_ctor_t ctor,
837 spl_kmem_dtor_t dtor,
838 spl_kmem_reclaim_t reclaim,
839 void *priv, void *vmp, int flags)
840 {
841 spl_kmem_cache_t *skc;
842 uint32_t slab_max, slab_size, slab_objs;
843 int rc, kmem_flags = KM_SLEEP;
844 ENTRY;
845
846 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
847 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
848 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
849
850 /* We may be called when there is a non-zero preempt_count or
851 * interrupts are disabled in which case we must not sleep.
852 */
853 if (current_thread_info()->preempt_count || irqs_disabled())
854 kmem_flags = KM_NOSLEEP;
855
856 /* Allocate new cache memory and initialize. */
857 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
858 if (skc == NULL)
859 RETURN(NULL);
860
861 skc->skc_magic = SKC_MAGIC;
862 skc->skc_name_size = strlen(name) + 1;
863 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
864 if (skc->skc_name == NULL) {
865 kmem_free(skc, sizeof(*skc));
866 RETURN(NULL);
867 }
868 strncpy(skc->skc_name, name, skc->skc_name_size);
869
870 skc->skc_ctor = ctor;
871 skc->skc_dtor = dtor;
872 skc->skc_reclaim = reclaim;
873 skc->skc_private = priv;
874 skc->skc_vmp = vmp;
875 skc->skc_flags = flags;
876 skc->skc_obj_size = size;
877 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
878
879 INIT_LIST_HEAD(&skc->skc_list);
880 INIT_LIST_HEAD(&skc->skc_complete_list);
881 INIT_LIST_HEAD(&skc->skc_partial_list);
882 spin_lock_init(&skc->skc_lock);
883 skc->skc_slab_fail = 0;
884 skc->skc_slab_create = 0;
885 skc->skc_slab_destroy = 0;
886 skc->skc_slab_total = 0;
887 skc->skc_slab_alloc = 0;
888 skc->skc_slab_max = 0;
889 skc->skc_obj_total = 0;
890 skc->skc_obj_alloc = 0;
891 skc->skc_obj_max = 0;
892
893 /* If no type was passed select a cache type based on object size */
894 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
895 if (skc->skc_obj_size < (PAGE_SIZE / 8)) {
896 skc->skc_flags |= KMC_KMEM;
897 } else {
898 skc->skc_flags |= KMC_VMEM;
899 }
900 }
901
902 /* Size slabs properly to ensure they are not too large */
903 slab_max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
904 if (skc->skc_flags & KMC_OFFSLAB) {
905 skc->skc_slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
906 skc->skc_slab_size = sizeof(spl_kmem_slab_t);
907 ASSERT(skc->skc_obj_size < slab_max);
908 } else {
909 slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB + 1;
910
911 do {
912 slab_objs--;
913 slab_size = sizeof(spl_kmem_slab_t) + slab_objs *
914 (skc->skc_obj_size+sizeof(spl_kmem_obj_t));
915 } while (slab_size > slab_max);
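		/* For example, assuming 4 KiB pages and MAX_ORDER == 11,
		 * slab_max works out to 1024 pages (4 MiB), so the loop
		 * above only trims slab_objs below the default
		 * SPL_KMEM_CACHE_OBJ_PER_SLAB count when the per-object
		 * footprint would otherwise push a slab past that limit. */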
916
917 skc->skc_slab_objs = slab_objs;
918 skc->skc_slab_size = slab_size;
919 }
920
921 rc = spl_magazine_create(skc);
922 if (rc) {
923 kmem_free(skc->skc_name, skc->skc_name_size);
924 kmem_free(skc, sizeof(*skc));
925 RETURN(NULL);
926 }
927
928 down_write(&spl_kmem_cache_sem);
929 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
930 up_write(&spl_kmem_cache_sem);
931
932 RETURN(skc);
933 }
934 EXPORT_SYMBOL(spl_kmem_cache_create);
935
936 void
937 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
938 {
939 spl_kmem_slab_t *sks, *m;
940 ENTRY;
941
942 ASSERT(skc->skc_magic == SKC_MAGIC);
943
944 down_write(&spl_kmem_cache_sem);
945 list_del_init(&skc->skc_list);
946 up_write(&spl_kmem_cache_sem);
947
948 spl_magazine_destroy(skc);
949 spin_lock(&skc->skc_lock);
950
951 /* Validate there are no objects in use and free all the
952 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
953 ASSERT(list_empty(&skc->skc_complete_list));
954 ASSERT(skc->skc_slab_alloc == 0);
955 ASSERT(skc->skc_obj_alloc == 0);
956
957 list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
958 spl_slab_free(sks);
959
960 ASSERT(skc->skc_slab_total == 0);
961 ASSERT(skc->skc_obj_total == 0);
962
963 kmem_free(skc->skc_name, skc->skc_name_size);
964 spin_unlock(&skc->skc_lock);
965
966 kmem_free(skc, sizeof(*skc));
967
968 EXIT;
969 }
970 EXPORT_SYMBOL(spl_kmem_cache_destroy);
971
972 static void *
973 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
974 {
975 spl_kmem_obj_t *sko;
976
977 ASSERT(skc->skc_magic == SKC_MAGIC);
978 ASSERT(sks->sks_magic == SKS_MAGIC);
979 ASSERT(spin_is_locked(&skc->skc_lock));
980
981 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
982 ASSERT(sko->sko_magic == SKO_MAGIC);
983 ASSERT(sko->sko_addr != NULL);
984
985 /* Remove from sks_free_list */
986 list_del_init(&sko->sko_list);
987
988 sks->sks_age = jiffies;
989 sks->sks_ref++;
990 skc->skc_obj_alloc++;
991
992 /* Track max obj usage statistics */
993 if (skc->skc_obj_alloc > skc->skc_obj_max)
994 skc->skc_obj_max = skc->skc_obj_alloc;
995
996 /* Track max slab usage statistics */
997 if (sks->sks_ref == 1) {
998 skc->skc_slab_alloc++;
999
1000 if (skc->skc_slab_alloc > skc->skc_slab_max)
1001 skc->skc_slab_max = skc->skc_slab_alloc;
1002 }
1003
1004 return sko->sko_addr;
1005 }
1006
1007 /* No available objects, create a new slab. Since this is an
1008 * expensive operation we do it without holding the spinlock
1009 * and only briefly acquire it when we link in the fully
1010 * allocated and constructed slab.
1011 */
1012 static spl_kmem_slab_t *
1013 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1014 {
1015 spl_kmem_slab_t *sks;
1016 ENTRY;
1017
1018 ASSERT(skc->skc_magic == SKC_MAGIC);
1019
1020 if (flags & __GFP_WAIT) {
1021 flags |= __GFP_NOFAIL;
1022 local_irq_enable();
1023 might_sleep();
1024 }
1025
1026 sks = spl_slab_alloc(skc, flags);
1027 if (sks == NULL) {
1028 if (flags & __GFP_WAIT)
1029 local_irq_disable();
1030
1031 RETURN(NULL);
1032 }
1033
1034 if (flags & __GFP_WAIT)
1035 local_irq_disable();
1036
1037 /* Link the new empty slab into the end of skc_partial_list */
1038 spin_lock(&skc->skc_lock);
1039 skc->skc_slab_total++;
1040 skc->skc_obj_total += sks->sks_objs;
1041 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1042 spin_unlock(&skc->skc_lock);
1043
1044 RETURN(sks);
1045 }
1046
1047 static int
1048 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1049 {
1050 spl_kmem_slab_t *sks;
1051 int rc = 0, refill;
1052 ENTRY;
1053
1054 ASSERT(skc->skc_magic == SKC_MAGIC);
1055 ASSERT(skm->skm_magic == SKM_MAGIC);
1056
1057 /* XXX: Check for refill bouncing by age perhaps */
1058 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1059
1060 spin_lock(&skc->skc_lock);
1061
1062 while (refill > 0) {
1063 /* No slabs available, we must grow the cache */
1064 if (list_empty(&skc->skc_partial_list)) {
1065 spin_unlock(&skc->skc_lock);
1066
1067 sks = spl_cache_grow(skc, flags);
1068 if (!sks)
1069 GOTO(out, rc);
1070
1071 /* Rescheduled to a different CPU, skm is not local */
1072 if (skm != skc->skc_mag[smp_processor_id()])
1073 GOTO(out, rc);
1074
1075 /* Potentially rescheduled to the same CPU but
1076 * allocations may have occurred from this CPU while
1077 * we were sleeping so recalculate max refill. */
1078 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1079
1080 spin_lock(&skc->skc_lock);
1081 continue;
1082 }
1083
1084 /* Grab the next available slab */
1085 sks = list_entry((&skc->skc_partial_list)->next,
1086 spl_kmem_slab_t, sks_list);
1087 ASSERT(sks->sks_magic == SKS_MAGIC);
1088 ASSERT(sks->sks_ref < sks->sks_objs);
1089 ASSERT(!list_empty(&sks->sks_free_list));
1090
1091 /* Consume as many objects as needed to refill the requested
1092 * cache. We must also be careful not to overfill it. */
1093 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1094 ASSERT(skm->skm_avail < skm->skm_size);
1095 ASSERT(rc < skm->skm_size);
1096 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1097 }
1098
1099 /* Move slab to skc_complete_list when full */
1100 if (sks->sks_ref == sks->sks_objs) {
1101 list_del(&sks->sks_list);
1102 list_add(&sks->sks_list, &skc->skc_complete_list);
1103 }
1104 }
1105
1106 spin_unlock(&skc->skc_lock);
1107 out:
1108 /* Returns the number of entries added to cache */
1109 RETURN(rc);
1110 }
1111
1112 static void
1113 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1114 {
1115 spl_kmem_slab_t *sks = NULL;
1116 spl_kmem_obj_t *sko = NULL;
1117 ENTRY;
1118
1119 ASSERT(skc->skc_magic == SKC_MAGIC);
1120 ASSERT(spin_is_locked(&skc->skc_lock));
1121
1122 sko = obj + skc->skc_obj_size;
1123 ASSERT(sko->sko_magic == SKO_MAGIC);
1124
1125 sks = sko->sko_slab;
1126 ASSERT(sks->sks_magic == SKS_MAGIC);
1127 ASSERT(sks->sks_cache == skc);
1128 list_add(&sko->sko_list, &sks->sks_free_list);
1129
1130 sks->sks_age = jiffies;
1131 sks->sks_ref--;
1132 skc->skc_obj_alloc--;
1133
1134 /* Move slab to skc_partial_list when no longer full. Slabs
1135 * are added to the head to keep the partial list in quasi-full
1136 * sorted order. Fuller at the head, emptier at the tail. */
1137 if (sks->sks_ref == (sks->sks_objs - 1)) {
1138 list_del(&sks->sks_list);
1139 list_add(&sks->sks_list, &skc->skc_partial_list);
1140 }
1141
1142 /* Move empty slabs to the end of the partial list so
1143 * they can be easily found and freed during reclamation. */
1144 if (sks->sks_ref == 0) {
1145 list_del(&sks->sks_list);
1146 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1147 skc->skc_slab_alloc--;
1148 }
1149
1150 EXIT;
1151 }
1152
1153 static int
1154 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1155 {
1156 int i, count = MIN(flush, skm->skm_avail);
1157 ENTRY;
1158
1159 ASSERT(skc->skc_magic == SKC_MAGIC);
1160 ASSERT(skm->skm_magic == SKM_MAGIC);
1161
1162 spin_lock(&skc->skc_lock);
1163
1164 for (i = 0; i < count; i++)
1165 spl_cache_shrink(skc, skm->skm_objs[i]);
1166
1167 // __spl_slab_reclaim(skc);
1168 skm->skm_avail -= count;
1169 memmove(skm->skm_objs, &(skm->skm_objs[count]),
1170 sizeof(void *) * skm->skm_avail);
1171
1172 spin_unlock(&skc->skc_lock);
1173
1174 RETURN(count);
1175 }
1176
1177 void *
1178 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1179 {
1180 spl_kmem_magazine_t *skm;
1181 unsigned long irq_flags;
1182 void *obj = NULL;
1183 int id;
1184 ENTRY;
1185
1186 ASSERT(skc->skc_magic == SKC_MAGIC);
1187 ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
1188 local_irq_save(irq_flags);
1189
1190 restart:
1191 /* Safe to update per-cpu structure without lock, but
1192 * in the restart case we must be careful to reacquire
1193 * the local magazine since this may have changed
1194 * when we need to grow the cache. */
1195 id = smp_processor_id();
1196 ASSERTF(id < 4, "cache=%p smp_processor_id=%d\n", skc, id);
1197 skm = skc->skc_mag[smp_processor_id()];
1198 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1199 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1200 skm->skm_size, skm->skm_refill, skm->skm_avail);
1201
1202 if (likely(skm->skm_avail)) {
1203 /* Object available in CPU cache, use it */
1204 obj = skm->skm_objs[--skm->skm_avail];
1205 if (!(skc->skc_flags & KMC_NOTOUCH))
1206 skm->skm_age = jiffies;
1207 } else {
1208 /* Per-CPU cache empty, directly allocate from
1209 * the slab and refill the per-CPU cache. */
1210 (void)spl_cache_refill(skc, skm, flags);
1211 GOTO(restart, obj = NULL);
1212 }
1213
1214 local_irq_restore(irq_flags);
1215 ASSERT(obj);
1216
1217 /* Pre-emptively migrate object to CPU L1 cache */
1218 prefetchw(obj);
1219
1220 RETURN(obj);
1221 }
1222 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1223
1224 void
1225 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1226 {
1227 spl_kmem_magazine_t *skm;
1228 unsigned long flags;
1229 ENTRY;
1230
1231 ASSERT(skc->skc_magic == SKC_MAGIC);
1232 local_irq_save(flags);
1233
1234 /* Safe to update per-cpu structure without lock, but
1235 * since no remote memory allocation tracking is performed
1236 * it is entirely possible to allocate an object from one
1237 * CPU cache and return it to another. */
1238 skm = skc->skc_mag[smp_processor_id()];
1239 ASSERT(skm->skm_magic == SKM_MAGIC);
1240
1241 /* Per-CPU cache full, flush it to make space */
1242 if (unlikely(skm->skm_avail >= skm->skm_size))
1243 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1244
1245 /* Available space in cache, use it */
1246 skm->skm_objs[skm->skm_avail++] = obj;
1247
1248 local_irq_restore(flags);
1249
1250 EXIT;
1251 }
1252 EXPORT_SYMBOL(spl_kmem_cache_free);
1253
1254 static int
1255 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
1256 {
1257 spl_kmem_cache_t *skc;
1258
1259 /* Under Linux a shrinker is not tightly coupled with a slab
1260 * cache. In fact Linux always systematically tries calling all
1261 * registered shrinker callbacks until its target reclamation level
1262 * is reached. Because of this we only register one shrinker
1263 * function in the shim layer for all slab caches. And we always
1264 * attempt to shrink all caches when this generic shrinker is called.
1265 */
1266 down_read(&spl_kmem_cache_sem);
1267
1268 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
1269 spl_kmem_cache_reap_now(skc);
1270
1271 up_read(&spl_kmem_cache_sem);
1272
1273 /* XXX: Under Linux we should return the remaining number of
1274 * entries in the cache. We should do this as well.
1275 */
1276 return 1;
1277 }
1278
1279 void
1280 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1281 {
1282 spl_kmem_magazine_t *skm;
1283 int i;
1284 ENTRY;
1285
1286 ASSERT(skc->skc_magic == SKC_MAGIC);
1287
1288 if (skc->skc_reclaim)
1289 skc->skc_reclaim(skc->skc_private);
1290
1291 /* Ensure per-CPU caches which are idle gradually flush */
1292 for_each_online_cpu(i) {
1293 skm = skc->skc_mag[i];
1294
1295 if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1296 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1297 }
1298
1299 spl_slab_reclaim(skc);
1300
1301 EXIT;
1302 }
1303 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1304
1305 void
1306 spl_kmem_reap(void)
1307 {
1308 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
1309 }
1310 EXPORT_SYMBOL(spl_kmem_reap);
1311
1312 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
1313 static char *
1314 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
1315 {
1316 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
1317 int i, flag = 1;
1318
1319 ASSERT(str != NULL && len >= 17);
1320 memset(str, 0, len);
1321
1322 /* Check for a fully printable string, and while we are at
1323 * it place the printable characters in the passed buffer. */
1324 for (i = 0; i < size; i++) {
1325 str[i] = ((char *)(kd->kd_addr))[i];
1326 if (isprint(str[i])) {
1327 continue;
1328 } else {
1329 /* Minimum number of printable characters found
1330 * to make it worthwhile to print this as ascii. */
1331 if (i > min)
1332 break;
1333
1334 flag = 0;
1335 break;
1336 }
1337 }
1338
1339 if (!flag) {
1340 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1341 *((uint8_t *)kd->kd_addr),
1342 *((uint8_t *)kd->kd_addr + 2),
1343 *((uint8_t *)kd->kd_addr + 4),
1344 *((uint8_t *)kd->kd_addr + 6),
1345 *((uint8_t *)kd->kd_addr + 8),
1346 *((uint8_t *)kd->kd_addr + 10),
1347 *((uint8_t *)kd->kd_addr + 12),
1348 *((uint8_t *)kd->kd_addr + 14));
1349 }
1350
1351 return str;
1352 }
1353
1354 static int
1355 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1356 {
1357 int i;
1358 ENTRY;
1359
1360 spin_lock_init(lock);
1361 INIT_LIST_HEAD(list);
1362
1363 for (i = 0; i < size; i++)
1364 INIT_HLIST_HEAD(&kmem_table[i]);
1365
1366 RETURN(0);
1367 }
1368
1369 static void
1370 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
1371 {
1372 unsigned long flags;
1373 kmem_debug_t *kd;
1374 char str[17];
1375 ENTRY;
1376
1377 spin_lock_irqsave(lock, flags);
1378 if (!list_empty(list))
1379 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1380 "size", "data", "func", "line");
1381
1382 list_for_each_entry(kd, list, kd_list)
1383 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1384 kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
1385 kd->kd_func, kd->kd_line);
1386
1387 spin_unlock_irqrestore(lock, flags);
1388 EXIT;
1389 }
1390 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1391 #define spl_kmem_init_tracking(list, lock, size)
1392 #define spl_kmem_fini_tracking(list, lock)
1393 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1394
1395 int
1396 spl_kmem_init(void)
1397 {
1398 int rc = 0;
1399 ENTRY;
1400
1401 init_rwsem(&spl_kmem_cache_sem);
1402 INIT_LIST_HEAD(&spl_kmem_cache_list);
1403
1404 #ifdef HAVE_SET_SHRINKER
1405 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1406 spl_kmem_cache_generic_shrinker);
1407 if (spl_kmem_cache_shrinker == NULL)
1408 RETURN(rc = -ENOMEM);
1409 #else
1410 register_shrinker(&spl_kmem_cache_shrinker);
1411 #endif
1412
1413 #ifdef DEBUG_KMEM
1414 atomic64_set(&kmem_alloc_used, 0);
1415 atomic64_set(&vmem_alloc_used, 0);
1416
1417 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1418 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1419 #endif
1420 RETURN(rc);
1421 }
1422
1423 void
1424 spl_kmem_fini(void)
1425 {
1426 #ifdef DEBUG_KMEM
1427 /* Display all unreclaimed memory addresses, including the
1428 * allocation size and the first few bytes of what's located
1429 * at that address to aid in debugging. Performance is not
1430 * a serious concern here since it is module unload time. */
1431 if (atomic64_read(&kmem_alloc_used) != 0)
1432 CWARN("kmem leaked %ld/%ld bytes\n",
1433 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
1434
1435
1436 if (atomic64_read(&vmem_alloc_used) != 0)
1437 CWARN("vmem leaked %ld/%ld bytes\n",
1438 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
1439
1440 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1441 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1442 #endif /* DEBUG_KMEM */
1443 ENTRY;
1444
1445 #ifdef HAVE_SET_SHRINKER
1446 remove_shrinker(spl_kmem_cache_shrinker);
1447 #else
1448 unregister_shrinker(&spl_kmem_cache_shrinker);
1449 #endif
1450
1451 EXIT;
1452 }