modules/spl/spl-kmem.c (mirror_spl.git)
1 /*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
27 #include <sys/kmem.h>
28
29 #ifdef DEBUG_SUBSYSTEM
30 # undef DEBUG_SUBSYSTEM
31 #endif
32
33 #define DEBUG_SUBSYSTEM S_KMEM
34
35 /*
36 * Memory allocation interfaces and debugging for basic kmem_*
 37  * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
 38  * all allocations will be tracked when they are allocated and
 39  * freed. When the SPL module is unloaded a list of all leaked
 40  * addresses and where they were allocated will be dumped to the
 41  * console. Enabling this feature has a significant impact on
 42  * performance but it makes finding memory leaks straightforward.
43 */
44 #ifdef DEBUG_KMEM
45 /* Shim layer memory accounting */
46 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
47 unsigned long long kmem_alloc_max = 0;
48 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
49 unsigned long long vmem_alloc_max = 0;
50 int kmem_warning_flag = 1;
51
52 EXPORT_SYMBOL(kmem_alloc_used);
53 EXPORT_SYMBOL(kmem_alloc_max);
54 EXPORT_SYMBOL(vmem_alloc_used);
55 EXPORT_SYMBOL(vmem_alloc_max);
56 EXPORT_SYMBOL(kmem_warning_flag);
57
58 # ifdef DEBUG_KMEM_TRACKING
59
 60 /* XXX - Not too surprisingly with debugging enabled the xmem_locks are very
61 * highly contended particularly on xfree(). If we want to run with this
62 * detailed debugging enabled for anything other than debugging we need to
63 * minimize the contention by moving to a lock per xmem_table entry model.
64 */
65
66 # define KMEM_HASH_BITS 10
67 # define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
68
69 # define VMEM_HASH_BITS 10
70 # define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
71
72 typedef struct kmem_debug {
73 struct hlist_node kd_hlist; /* Hash node linkage */
74 struct list_head kd_list; /* List of all allocations */
75 void *kd_addr; /* Allocation pointer */
76 size_t kd_size; /* Allocation size */
77 const char *kd_func; /* Allocation function */
78 int kd_line; /* Allocation line */
79 } kmem_debug_t;
80
81 spinlock_t kmem_lock;
82 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
83 struct list_head kmem_list;
84
85 spinlock_t vmem_lock;
86 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
87 struct list_head vmem_list;
88
89 EXPORT_SYMBOL(kmem_lock);
90 EXPORT_SYMBOL(kmem_table);
91 EXPORT_SYMBOL(kmem_list);
92
93 EXPORT_SYMBOL(vmem_lock);
94 EXPORT_SYMBOL(vmem_table);
95 EXPORT_SYMBOL(vmem_list);
96 # endif
97
98 int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
99 #else
100 int kmem_set_warning(int flag) { return 0; }
101 #endif
102 EXPORT_SYMBOL(kmem_set_warning);
103
104 /*
105 * Slab allocation interfaces
106 *
107 * While the Linux slab implementation was inspired by the Solaris
108  * implementation I cannot use it to emulate the Solaris APIs. I
109 * require two features which are not provided by the Linux slab.
110 *
111 * 1) Constructors AND destructors. Recent versions of the Linux
112 * kernel have removed support for destructors. This is a deal
113 * breaker for the SPL which contains particularly expensive
114  * initializers for mutexes, condition variables, etc. We also
115  * require a minimal level of cleanup for these data types unlike
116  * many Linux data types which do not need to be explicitly destroyed.
117 *
118 * 2) Virtual address space backed slab. Callers of the Solaris slab
119  * expect it to work well for both small and very large allocations.
120 * Because of memory fragmentation the Linux slab which is backed
121 * by kmalloc'ed memory performs very badly when confronted with
122 * large numbers of large allocations. Basing the slab on the
123  * virtual address space removes the need for contiguous pages
124  * and greatly improves performance for large allocations.
125 *
126 * For these reasons, the SPL has its own slab implementation with
127 * the needed features. It is not as highly optimized as either the
128 * Solaris or Linux slabs, but it should get me most of what is
129 * needed until it can be optimized or obsoleted by another approach.
130 *
131 * One serious concern I do have about this method is the relatively
132 * small virtual address space on 32bit arches. This will seriously
133 * constrain the size of the slab caches and their performance.
134 *
135 * XXX: Implement work requests to keep an eye on each cache and
136 * shrink them via spl_slab_reclaim() when they are wasting lots
137 * of space. Currently this process is driven by the reapers.
138 *
139 * XXX: Improve the partial slab list by carefully maintaining a
140 * strict ordering of fullest to emptiest slabs based on
141  * the slab reference count. This guarantees that when freeing
142 * slabs back to the system we need only linearly traverse the
143 * last N slabs in the list to discover all the freeable slabs.
144 *
145 * XXX: NUMA awareness for optionally allocating memory close to a
146  * particular core. This can be advantageous if you know the slab
147 * object will be short lived and primarily accessed from one core.
148 *
149 * XXX: Slab coloring may also yield performance improvements and would
150 * be desirable to implement.
151 *
152 * XXX: Proper hardware cache alignment would be good too.
153 */
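/*
 * A minimal usage sketch of this cache API. The object type my_obj_t,
 * the callback names, and the constructor's int return type are
 * hypothetical; the calling conventions are inferred from how skc_ctor,
 * skc_dtor, spl_kmem_cache_alloc() and spl_kmem_cache_free() are used
 * later in this file.
 *
 *   static int
 *   my_obj_ctor(void *obj, void *priv, int flags)
 *   {
 *           my_obj_t *mo = obj;
 *           mo->mo_magic = MY_OBJ_MAGIC;    (expensive init would go here)
 *           return 0;
 *   }
 *
 *   static void
 *   my_obj_dtor(void *obj, void *priv)
 *   {
 *           my_obj_t *mo = obj;
 *           mo->mo_magic = 0;               (matching cleanup goes here)
 *   }
 *
 *   skc = spl_kmem_cache_create("my_obj_cache", sizeof(my_obj_t), 0,
 *                               my_obj_ctor, my_obj_dtor, NULL,
 *                               NULL, NULL, 0);
 *   obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *   ...
 *   spl_kmem_cache_free(skc, obj);
 *   spl_kmem_cache_destroy(skc);
 */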
154
155 struct list_head spl_kmem_cache_list; /* List of caches */
156 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
157
158 static int spl_cache_flush(spl_kmem_cache_t *skc,
159 spl_kmem_magazine_t *skm, int flush);
160
161 #ifdef HAVE_SET_SHRINKER
162 static struct shrinker *spl_kmem_cache_shrinker;
163 #else
164 static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
165 unsigned int gfp_mask);
166 static struct shrinker spl_kmem_cache_shrinker = {
167 .shrink = spl_kmem_cache_generic_shrinker,
168 .seeks = KMC_DEFAULT_SEEKS,
169 };
170 #endif
171
172 #ifdef DEBUG_KMEM
173 # ifdef DEBUG_KMEM_TRACKING
174
175 static kmem_debug_t *
176 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
177 void *addr)
178 {
179 struct hlist_head *head;
180 struct hlist_node *node;
181 struct kmem_debug *p;
182 unsigned long flags;
183 ENTRY;
184
185 spin_lock_irqsave(lock, flags);
186
187 head = &table[hash_ptr(addr, bits)];
188 hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
189 if (p->kd_addr == addr) {
190 hlist_del_init(&p->kd_hlist);
191 list_del_init(&p->kd_list);
192 spin_unlock_irqrestore(lock, flags);
193 return p;
194 }
195 }
196
197 spin_unlock_irqrestore(lock, flags);
198
199 RETURN(NULL);
200 }
201
202 void *
203 kmem_alloc_track(size_t size, int flags, const char *func, int line,
204 int node_alloc, int node)
205 {
206 void *ptr = NULL;
207 kmem_debug_t *dptr;
208 unsigned long irq_flags;
209 ENTRY;
210
211 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
212 flags & ~__GFP_ZERO);
213
214 if (dptr == NULL) {
215 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
216 sizeof(kmem_debug_t), flags);
217 } else {
218 /* Marked unlikely because we should never be doing this,
219          * we tolerate up to 2 pages but a single page is best. */
220 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
221 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
222 (unsigned long long) size, flags,
223 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
224
225 /* Use the correct allocator */
226 if (node_alloc) {
227 ASSERT(!(flags & __GFP_ZERO));
228 ptr = kmalloc_node(size, flags, node);
229 } else if (flags & __GFP_ZERO) {
230 ptr = kzalloc(size, flags & ~__GFP_ZERO);
231 } else {
232 ptr = kmalloc(size, flags);
233 }
234
235 if (unlikely(ptr == NULL)) {
236 kfree(dptr);
237 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
238 (unsigned long long) size, flags,
239 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
240 goto out;
241 }
242
243 atomic64_add(size, &kmem_alloc_used);
244 if (unlikely(atomic64_read(&kmem_alloc_used) >
245 kmem_alloc_max))
246 kmem_alloc_max =
247 atomic64_read(&kmem_alloc_used);
248
249 INIT_HLIST_NODE(&dptr->kd_hlist);
250 INIT_LIST_HEAD(&dptr->kd_list);
251
252 dptr->kd_addr = ptr;
253 dptr->kd_size = size;
254 dptr->kd_func = func;
255 dptr->kd_line = line;
256
257 spin_lock_irqsave(&kmem_lock, irq_flags);
258 hlist_add_head_rcu(&dptr->kd_hlist,
259 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
260 list_add_tail(&dptr->kd_list, &kmem_list);
261 spin_unlock_irqrestore(&kmem_lock, irq_flags);
262
263 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
264 "(%lld/%llu)\n", (unsigned long long) size, flags,
265 ptr, atomic64_read(&kmem_alloc_used),
266 kmem_alloc_max);
267 }
268 out:
269 RETURN(ptr);
270 }
271 EXPORT_SYMBOL(kmem_alloc_track);
272
273 void
274 kmem_free_track(void *ptr, size_t size)
275 {
276 kmem_debug_t *dptr;
277 ENTRY;
278
279 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
280 (unsigned long long) size);
281
282 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
283
284 ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
285
286 /* Size must match */
287 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
288 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
289 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
290
291 atomic64_sub(size, &kmem_alloc_used);
292
293 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
294 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
295 kmem_alloc_max);
296
297 memset(dptr, 0x5a, sizeof(kmem_debug_t));
298 kfree(dptr);
299
300 memset(ptr, 0x5a, size);
301 kfree(ptr);
302
303 EXIT;
304 }
305 EXPORT_SYMBOL(kmem_free_track);
306
307 void *
308 vmem_alloc_track(size_t size, int flags, const char *func, int line)
309 {
310 void *ptr = NULL;
311 kmem_debug_t *dptr;
312 unsigned long irq_flags;
313 ENTRY;
314
315 ASSERT(flags & KM_SLEEP);
316
317 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
318 if (dptr == NULL) {
319 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
320 sizeof(kmem_debug_t), flags);
321 } else {
322 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
323 PAGE_KERNEL);
324
325 if (unlikely(ptr == NULL)) {
326 kfree(dptr);
327 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
328 (unsigned long long) size, flags,
329 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
330 goto out;
331 }
332
333 if (flags & __GFP_ZERO)
334 memset(ptr, 0, size);
335
336 atomic64_add(size, &vmem_alloc_used);
337 if (unlikely(atomic64_read(&vmem_alloc_used) >
338 vmem_alloc_max))
339 vmem_alloc_max =
340 atomic64_read(&vmem_alloc_used);
341
342 INIT_HLIST_NODE(&dptr->kd_hlist);
343 INIT_LIST_HEAD(&dptr->kd_list);
344
345 dptr->kd_addr = ptr;
346 dptr->kd_size = size;
347 dptr->kd_func = func;
348 dptr->kd_line = line;
349
350 spin_lock_irqsave(&vmem_lock, irq_flags);
351 hlist_add_head_rcu(&dptr->kd_hlist,
352 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
353 list_add_tail(&dptr->kd_list, &vmem_list);
354 spin_unlock_irqrestore(&vmem_lock, irq_flags);
355
356 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
357 "(%lld/%llu)\n", (unsigned long long) size, flags,
358 ptr, atomic64_read(&vmem_alloc_used),
359 vmem_alloc_max);
360 }
361 out:
362 RETURN(ptr);
363 }
364 EXPORT_SYMBOL(vmem_alloc_track);
365
366 void
367 vmem_free_track(void *ptr, size_t size)
368 {
369 kmem_debug_t *dptr;
370 ENTRY;
371
372 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
373 (unsigned long long) size);
374
375 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
376 ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
377
378 /* Size must match */
379 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
380 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
381 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
382
383 atomic64_sub(size, &vmem_alloc_used);
384 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
385 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
386 vmem_alloc_max);
387
388 memset(dptr, 0x5a, sizeof(kmem_debug_t));
389 kfree(dptr);
390
391 memset(ptr, 0x5a, size);
392 vfree(ptr);
393
394 EXIT;
395 }
396 EXPORT_SYMBOL(vmem_free_track);
397
398 # else /* DEBUG_KMEM_TRACKING */
399
400 void *
401 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
402 int node_alloc, int node)
403 {
404 void *ptr;
405 ENTRY;
406
407 /* Marked unlikely because we should never be doing this,
408          * we tolerate up to 2 pages but a single page is best. */
409 if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
410 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
411 (unsigned long long) size, flags,
412 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
413
414 /* Use the correct allocator */
415 if (node_alloc) {
416 ASSERT(!(flags & __GFP_ZERO));
417 ptr = kmalloc_node(size, flags, node);
418 } else if (flags & __GFP_ZERO) {
419 ptr = kzalloc(size, flags & (~__GFP_ZERO));
420 } else {
421 ptr = kmalloc(size, flags);
422 }
423
424 if (ptr == NULL) {
425 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
426 (unsigned long long) size, flags,
427 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
428 } else {
429 atomic64_add(size, &kmem_alloc_used);
430 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
431 kmem_alloc_max = atomic64_read(&kmem_alloc_used);
432
433 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
434 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
435 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
436 }
437 RETURN(ptr);
438 }
439 EXPORT_SYMBOL(kmem_alloc_debug);
440
441 void
442 kmem_free_debug(void *ptr, size_t size)
443 {
444 ENTRY;
445
446 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
447 (unsigned long long) size);
448
449 atomic64_sub(size, &kmem_alloc_used);
450
451 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
452 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
453 kmem_alloc_max);
454
455 memset(ptr, 0x5a, size);
456 kfree(ptr);
457
458 EXIT;
459 }
460 EXPORT_SYMBOL(kmem_free_debug);
461
462 void *
463 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
464 {
465 void *ptr;
466 ENTRY;
467
468 ASSERT(flags & KM_SLEEP);
469
470 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
471 PAGE_KERNEL);
472 if (ptr == NULL) {
473 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
474 (unsigned long long) size, flags,
475 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
476 } else {
477 if (flags & __GFP_ZERO)
478 memset(ptr, 0, size);
479
480 atomic64_add(size, &vmem_alloc_used);
481
482 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
483 vmem_alloc_max = atomic64_read(&vmem_alloc_used);
484
485 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
486 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
487 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
488 }
489
490 RETURN(ptr);
491 }
492 EXPORT_SYMBOL(vmem_alloc_debug);
493
494 void
495 vmem_free_debug(void *ptr, size_t size)
496 {
497 ENTRY;
498
499 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
500 (unsigned long long) size);
501
502 atomic64_sub(size, &vmem_alloc_used);
503
504 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
505 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
506 vmem_alloc_max);
507
508 memset(ptr, 0x5a, size);
509 vfree(ptr);
510
511 EXIT;
512 }
513 EXPORT_SYMBOL(vmem_free_debug);
514
515 # endif /* DEBUG_KMEM_TRACKING */
516 #endif /* DEBUG_KMEM */
517
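/* Allocate or free the memory backing a slab. KMC_KMEM caches use
 * kmem_alloc(), or whole pages for anything larger than two pages,
 * while KMC_VMEM caches use vmem_alloc() so large slabs do not
 * require physically contiguous pages. */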
518 static void *
519 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
520 {
521 void *ptr;
522
523 if (skc->skc_flags & KMC_KMEM) {
524 if (size > (2 * PAGE_SIZE)) {
525 ptr = (void *)__get_free_pages(flags, get_order(size));
526 } else
527 ptr = kmem_alloc(size, flags);
528 } else {
529 ptr = vmem_alloc(size, flags);
530 }
531
532 return ptr;
533 }
534
535 static void
536 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
537 {
538 if (skc->skc_flags & KMC_KMEM) {
539 if (size > (2 * PAGE_SIZE))
540 free_pages((unsigned long)ptr, get_order(size));
541 else
542 kmem_free(ptr, size);
543 } else {
544 vmem_free(ptr, size);
545 }
546 }
547
548 static spl_kmem_slab_t *
549 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
550 {
551 spl_kmem_slab_t *sks;
552 spl_kmem_obj_t *sko, *n;
553 void *base, *obj;
554 int i, size, rc = 0;
555
556 /* It's important that we pack the spl_kmem_obj_t structure
557          * and the actual objects into one large address space
558 * to minimize the number of calls to the allocator. It
559 * is far better to do a few large allocations and then
560 * subdivide it ourselves. Now which allocator we use
561          * requires balancing a few trade-offs.
562 *
563 * For small objects we use kmem_alloc() because as long
564 * as you are only requesting a small number of pages
565          * (ideally just one) it's cheap. However, when you start
566          * requesting multiple pages kmem_alloc() gets increasingly
567          * expensive since it requires contiguous pages. For this
568          * reason we shift to vmem_alloc() for slabs of large
569          * objects which removes the need for contiguous pages.
570 * We do not use vmem_alloc() in all cases because there
571 * is significant locking overhead in __get_vm_area_node().
572          * This function takes a single global lock when acquiring
573          * an available virtual address range which serializes all
574 * vmem_alloc()'s for all slab caches. Using slightly
575 * different allocation functions for small and large
576 * objects should give us the best of both worlds.
577 *
578 * sks struct: sizeof(spl_kmem_slab_t)
579 * obj data: skc->skc_obj_size
580 * obj struct: sizeof(spl_kmem_obj_t)
581 * <N obj data + obj structs>
582 *
583 * XXX: It would probably be a good idea to more carefully
584 * align these data structures in memory.
585 */
586 base = kv_alloc(skc, skc->skc_slab_size, flags);
587 if (base == NULL)
588 RETURN(NULL);
589
590 sks = (spl_kmem_slab_t *)base;
591 sks->sks_magic = SKS_MAGIC;
592 sks->sks_objs = skc->skc_slab_objs;
593 sks->sks_age = jiffies;
594 sks->sks_cache = skc;
595 INIT_LIST_HEAD(&sks->sks_list);
596 INIT_LIST_HEAD(&sks->sks_free_list);
597 sks->sks_ref = 0;
598 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
599
600 for (i = 0; i < sks->sks_objs; i++) {
601 if (skc->skc_flags & KMC_OFFSLAB) {
602 obj = kv_alloc(skc, size, flags);
603 if (!obj)
604 GOTO(out, rc = -ENOMEM);
605 } else {
606 obj = base + sizeof(spl_kmem_slab_t) + i * size;
607 }
608
609 sko = obj + skc->skc_obj_size;
610 sko->sko_addr = obj;
611 sko->sko_magic = SKO_MAGIC;
612 sko->sko_slab = sks;
613 INIT_LIST_HEAD(&sko->sko_list);
614 list_add_tail(&sko->sko_list, &sks->sks_free_list);
615 }
616
617 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
618 if (skc->skc_ctor)
619 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
620 out:
621 if (rc) {
622 if (skc->skc_flags & KMC_OFFSLAB)
623 list_for_each_entry_safe(sko,n,&sks->sks_free_list,sko_list)
624 kv_free(skc, sko->sko_addr, size);
625
626 kv_free(skc, base, skc->skc_slab_size);
627 sks = NULL;
628 }
629
630 RETURN(sks);
631 }
632
633 /* Removes slab from complete or partial list, so it must
634 * be called with the 'skc->skc_lock' held.
635 */
636 static void
637 spl_slab_free(spl_kmem_slab_t *sks) {
638 spl_kmem_cache_t *skc;
639 spl_kmem_obj_t *sko, *n;
640 int size;
641 ENTRY;
642
643 ASSERT(sks->sks_magic == SKS_MAGIC);
644 ASSERT(sks->sks_ref == 0);
645
646 skc = sks->sks_cache;
647 ASSERT(skc->skc_magic == SKC_MAGIC);
648 ASSERT(spin_is_locked(&skc->skc_lock));
649
650 skc->skc_obj_total -= sks->sks_objs;
651 skc->skc_slab_total--;
652 list_del(&sks->sks_list);
653 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
654
655         /* Run destructors as the slab is being released */
656 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
657 ASSERT(sko->sko_magic == SKO_MAGIC);
658
659 if (skc->skc_dtor)
660 skc->skc_dtor(sko->sko_addr, skc->skc_private);
661
662 if (skc->skc_flags & KMC_OFFSLAB)
663 kv_free(skc, sko->sko_addr, size);
664 }
665
666 kv_free(skc, sks, skc->skc_slab_size);
667 EXIT;
668 }
669
670 static int
671 __spl_slab_reclaim(spl_kmem_cache_t *skc)
672 {
673 spl_kmem_slab_t *sks, *m;
674 int rc = 0;
675 ENTRY;
676
677 ASSERT(spin_is_locked(&skc->skc_lock));
678 /*
679 * Free empty slabs which have not been touched in skc_delay
680 * seconds. This delay time is important to avoid thrashing.
681 * Empty slabs will be at the end of the skc_partial_list.
682 */
683 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
684 sks_list) {
685 if (sks->sks_ref > 0)
686 break;
687
688 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
689 spl_slab_free(sks);
690 rc++;
691 }
692 }
693
694 /* Returns number of slabs reclaimed */
695 RETURN(rc);
696 }
697
698 static int
699 spl_slab_reclaim(spl_kmem_cache_t *skc)
700 {
701 int rc;
702 ENTRY;
703
704 spin_lock(&skc->skc_lock);
705 rc = __spl_slab_reclaim(skc);
706 spin_unlock(&skc->skc_lock);
707
708 RETURN(rc);
709 }
710
711 static int
712 spl_magazine_size(spl_kmem_cache_t *skc)
713 {
714 int size;
715 ENTRY;
716
717 /* Guesses for reasonable magazine sizes, they
718 * should really adapt based on observed usage. */
719 if (skc->skc_obj_size > (PAGE_SIZE * 256))
720 size = 4;
721 else if (skc->skc_obj_size > (PAGE_SIZE * 32))
722 size = 16;
723 else if (skc->skc_obj_size > (PAGE_SIZE))
724 size = 64;
725 else if (skc->skc_obj_size > (PAGE_SIZE / 4))
726 size = 128;
727 else
728 size = 512;
729
730 RETURN(size);
731 }
732
733 static spl_kmem_magazine_t *
734 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
735 {
736 spl_kmem_magazine_t *skm;
737 int size = sizeof(spl_kmem_magazine_t) +
738 sizeof(void *) * skc->skc_mag_size;
739 ENTRY;
740
741 skm = kmem_alloc_node(size, GFP_KERNEL, node);
742 if (skm) {
743 skm->skm_magic = SKM_MAGIC;
744 skm->skm_avail = 0;
745 skm->skm_size = skc->skc_mag_size;
746 skm->skm_refill = skc->skc_mag_refill;
747 if (!(skc->skc_flags & KMC_NOTOUCH))
748 skm->skm_age = jiffies;
749 }
750
751 RETURN(skm);
752 }
753
754 static void
755 spl_magazine_free(spl_kmem_magazine_t *skm)
756 {
757 int size = sizeof(spl_kmem_magazine_t) +
758 sizeof(void *) * skm->skm_size;
759
760 ENTRY;
761 ASSERT(skm->skm_magic == SKM_MAGIC);
762 ASSERT(skm->skm_avail == 0);
763
764 kmem_free(skm, size);
765 EXIT;
766 }
767
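/* Allocate a magazine for every online CPU, sized by
 * spl_magazine_size(). On failure any magazines already created
 * are freed and -ENOMEM is returned. */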
768 static int
769 spl_magazine_create(spl_kmem_cache_t *skc)
770 {
771 int i;
772 ENTRY;
773
774 skc->skc_mag_size = spl_magazine_size(skc);
775 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
776
777 for_each_online_cpu(i) {
778 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
779 if (!skc->skc_mag[i]) {
780 for (i--; i >= 0; i--)
781 spl_magazine_free(skc->skc_mag[i]);
782
783 RETURN(-ENOMEM);
784 }
785 }
786
787 RETURN(0);
788 }
789
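/* Flush and free the per-CPU magazine of every online CPU. */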
790 static void
791 spl_magazine_destroy(spl_kmem_cache_t *skc)
792 {
793 spl_kmem_magazine_t *skm;
794 int i;
795 ENTRY;
796
797 for_each_online_cpu(i) {
798 skm = skc->skc_mag[i];
799 (void)spl_cache_flush(skc, skm, skm->skm_avail);
800 spl_magazine_free(skm);
801 }
802
803 EXIT;
804 }
805
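/* Create a new object cache. The constructor, destructor, and reclaim
 * callbacks are stored along with the private data pointer, a backing
 * allocator (KMC_KMEM or KMC_VMEM) is selected based on object size if
 * the caller did not specify one, the slab geometry is computed, and
 * the per-CPU magazines are created before the cache is linked onto
 * the global spl_kmem_cache_list. */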
806 spl_kmem_cache_t *
807 spl_kmem_cache_create(char *name, size_t size, size_t align,
808 spl_kmem_ctor_t ctor,
809 spl_kmem_dtor_t dtor,
810 spl_kmem_reclaim_t reclaim,
811 void *priv, void *vmp, int flags)
812 {
813 spl_kmem_cache_t *skc;
814 uint32_t slab_max, slab_size, slab_objs;
815 int rc, kmem_flags = KM_SLEEP;
816 ENTRY;
817
818 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
819 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
820 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
821
822 /* We may be called when there is a non-zero preempt_count or
823          * interrupts are disabled in which case we must not sleep.
824 */
825 if (current_thread_info()->preempt_count || irqs_disabled())
826 kmem_flags = KM_NOSLEEP;
827
828 /* Allocate new cache memory and initialize. */
829 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
830 if (skc == NULL)
831 RETURN(NULL);
832
833 skc->skc_magic = SKC_MAGIC;
834 skc->skc_name_size = strlen(name) + 1;
835 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
836 if (skc->skc_name == NULL) {
837 kmem_free(skc, sizeof(*skc));
838 RETURN(NULL);
839 }
840 strncpy(skc->skc_name, name, skc->skc_name_size);
841
842 skc->skc_ctor = ctor;
843 skc->skc_dtor = dtor;
844 skc->skc_reclaim = reclaim;
845 skc->skc_private = priv;
846 skc->skc_vmp = vmp;
847 skc->skc_flags = flags;
848 skc->skc_obj_size = size;
849 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
850
851 INIT_LIST_HEAD(&skc->skc_list);
852 INIT_LIST_HEAD(&skc->skc_complete_list);
853 INIT_LIST_HEAD(&skc->skc_partial_list);
854 spin_lock_init(&skc->skc_lock);
855 skc->skc_slab_fail = 0;
856 skc->skc_slab_create = 0;
857 skc->skc_slab_destroy = 0;
858 skc->skc_slab_total = 0;
859 skc->skc_slab_alloc = 0;
860 skc->skc_slab_max = 0;
861 skc->skc_obj_total = 0;
862 skc->skc_obj_alloc = 0;
863 skc->skc_obj_max = 0;
864
865         /* If no type was passed select a cache type based on object size */
866 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
867 if (skc->skc_obj_size < (PAGE_SIZE / 8)) {
868 skc->skc_flags |= KMC_KMEM;
869 } else {
870 skc->skc_flags |= KMC_VMEM;
871 }
872 }
873
874         /* Size slabs properly to ensure they are not too large */
875 slab_max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
876 if (skc->skc_flags & KMC_OFFSLAB) {
877 skc->skc_slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
878 skc->skc_slab_size = sizeof(spl_kmem_slab_t);
879 ASSERT(skc->skc_obj_size < slab_max);
880 } else {
881 slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB + 1;
882
883 do {
884 slab_objs--;
885 slab_size = sizeof(spl_kmem_slab_t) + slab_objs *
886 (skc->skc_obj_size+sizeof(spl_kmem_obj_t));
887 } while (slab_size > slab_max);
888
889 skc->skc_slab_objs = slab_objs;
890 skc->skc_slab_size = slab_size;
891 }
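        /* Worked example with illustrative values (not taken from this file):
         * with PAGE_SIZE = 4096 and MAX_ORDER = 11, slab_max is 1024 pages,
         * i.e. 4 MiB. If the per-object footprint, skc_obj_size +
         * sizeof(spl_kmem_obj_t), were 256 KiB, the loop above would trim
         * slab_objs down to 15, since 16 x 256 KiB already equals slab_max
         * and the spl_kmem_slab_t header must still fit (this assumes
         * SPL_KMEM_CACHE_OBJ_PER_SLAB is larger than 15). For small objects
         * the initial object count is kept unchanged. */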
892
893 rc = spl_magazine_create(skc);
894 if (rc) {
895 kmem_free(skc->skc_name, skc->skc_name_size);
896 kmem_free(skc, sizeof(*skc));
897 RETURN(NULL);
898 }
899
900 down_write(&spl_kmem_cache_sem);
901 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
902 up_write(&spl_kmem_cache_sem);
903
904 RETURN(skc);
905 }
906 EXPORT_SYMBOL(spl_kmem_cache_create);
907
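/* Destroy a cache. It is unlinked from the global list, the per-CPU
 * magazines are torn down, no objects may remain allocated, and all
 * remaining slabs are freed along with the cache structure itself. */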
908 void
909 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
910 {
911 spl_kmem_slab_t *sks, *m;
912 ENTRY;
913
914 ASSERT(skc->skc_magic == SKC_MAGIC);
915
916 down_write(&spl_kmem_cache_sem);
917 list_del_init(&skc->skc_list);
918 up_write(&spl_kmem_cache_sem);
919
920 spl_magazine_destroy(skc);
921 spin_lock(&skc->skc_lock);
922
923 /* Validate there are no objects in use and free all the
924 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
925 ASSERT(list_empty(&skc->skc_complete_list));
926 ASSERT(skc->skc_slab_alloc == 0);
927 ASSERT(skc->skc_obj_alloc == 0);
928
929 list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
930 spl_slab_free(sks);
931
932 ASSERT(skc->skc_slab_total == 0);
933 ASSERT(skc->skc_obj_total == 0);
934
935 kmem_free(skc->skc_name, skc->skc_name_size);
936 spin_unlock(&skc->skc_lock);
937
938 kmem_free(skc, sizeof(*skc));
939
940 EXIT;
941 }
942 EXPORT_SYMBOL(spl_kmem_cache_destroy);
943
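/* Remove the first object from the slab's free list and update the
 * object and slab usage statistics. Expects 'skc->skc_lock' to be
 * held by the caller. */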
944 static void *
945 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
946 {
947 spl_kmem_obj_t *sko;
948
949 ASSERT(skc->skc_magic == SKC_MAGIC);
950 ASSERT(sks->sks_magic == SKS_MAGIC);
951 ASSERT(spin_is_locked(&skc->skc_lock));
952
953 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
954 ASSERT(sko->sko_magic == SKO_MAGIC);
955 ASSERT(sko->sko_addr != NULL);
956
957 /* Remove from sks_free_list */
958 list_del_init(&sko->sko_list);
959
960 sks->sks_age = jiffies;
961 sks->sks_ref++;
962 skc->skc_obj_alloc++;
963
964 /* Track max obj usage statistics */
965 if (skc->skc_obj_alloc > skc->skc_obj_max)
966 skc->skc_obj_max = skc->skc_obj_alloc;
967
968 /* Track max slab usage statistics */
969 if (sks->sks_ref == 1) {
970 skc->skc_slab_alloc++;
971
972 if (skc->skc_slab_alloc > skc->skc_slab_max)
973 skc->skc_slab_max = skc->skc_slab_alloc;
974 }
975
976 return sko->sko_addr;
977 }
978
979 /* No objects are available so create a new slab. Since this is an
980  * expensive operation we do it without holding the spinlock
981  * and only briefly acquire it when we link in the fully
982 * allocated and constructed slab.
983 */
984 static spl_kmem_slab_t *
985 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
986 {
987 spl_kmem_slab_t *sks;
988 ENTRY;
989
990 ASSERT(skc->skc_magic == SKC_MAGIC);
991
992 if (flags & __GFP_WAIT) {
993 flags |= __GFP_NOFAIL;
994 local_irq_enable();
995 might_sleep();
996 }
997
998 sks = spl_slab_alloc(skc, flags);
999 if (sks == NULL) {
1000 if (flags & __GFP_WAIT)
1001 local_irq_disable();
1002
1003 RETURN(NULL);
1004 }
1005
1006 if (flags & __GFP_WAIT)
1007 local_irq_disable();
1008
1009         /* Link the new empty slab onto the end of skc_partial_list */
1010 spin_lock(&skc->skc_lock);
1011 skc->skc_slab_total++;
1012 skc->skc_obj_total += sks->sks_objs;
1013 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1014 spin_unlock(&skc->skc_lock);
1015
1016 RETURN(sks);
1017 }
1018
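/* Refill the per-CPU magazine from the cache's partial slabs, growing
 * the cache with new slabs when none are available. Returns the
 * number of objects added to the magazine. */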
1019 static int
1020 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1021 {
1022 spl_kmem_slab_t *sks;
1023 int rc = 0, refill;
1024 ENTRY;
1025
1026 ASSERT(skc->skc_magic == SKC_MAGIC);
1027 ASSERT(skm->skm_magic == SKM_MAGIC);
1028
1029 /* XXX: Check for refill bouncing by age perhaps */
1030 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1031
1032 spin_lock(&skc->skc_lock);
1033
1034 while (refill > 0) {
1035                 /* No slabs available, we must grow the cache */
1036 if (list_empty(&skc->skc_partial_list)) {
1037 spin_unlock(&skc->skc_lock);
1038
1039 sks = spl_cache_grow(skc, flags);
1040 if (!sks)
1041 GOTO(out, rc);
1042
1043                         /* Rescheduled to a different CPU, skm is not local */
1044 if (skm != skc->skc_mag[smp_processor_id()])
1045 GOTO(out, rc);
1046
1047 /* Potentially rescheduled to the same CPU but
1048                          * allocations may have occurred from this CPU while
1049 * we were sleeping so recalculate max refill. */
1050 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1051
1052 spin_lock(&skc->skc_lock);
1053 continue;
1054 }
1055
1056 /* Grab the next available slab */
1057 sks = list_entry((&skc->skc_partial_list)->next,
1058 spl_kmem_slab_t, sks_list);
1059 ASSERT(sks->sks_magic == SKS_MAGIC);
1060 ASSERT(sks->sks_ref < sks->sks_objs);
1061 ASSERT(!list_empty(&sks->sks_free_list));
1062
1063 /* Consume as many objects as needed to refill the requested
1064 * cache. We must also be careful not to overfill it. */
1065 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1066 ASSERT(skm->skm_avail < skm->skm_size);
1067 ASSERT(rc < skm->skm_size);
1068 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1069 }
1070
1071 /* Move slab to skc_complete_list when full */
1072 if (sks->sks_ref == sks->sks_objs) {
1073 list_del(&sks->sks_list);
1074 list_add(&sks->sks_list, &skc->skc_complete_list);
1075 }
1076 }
1077
1078 spin_unlock(&skc->skc_lock);
1079 out:
1080 /* Returns the number of entries added to cache */
1081 RETURN(rc);
1082 }
1083
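/* Return an object to its slab's free list and move the slab between
 * the complete and partial lists as needed. Expects 'skc->skc_lock'
 * to be held by the caller. */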
1084 static void
1085 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1086 {
1087 spl_kmem_slab_t *sks = NULL;
1088 spl_kmem_obj_t *sko = NULL;
1089 ENTRY;
1090
1091 ASSERT(skc->skc_magic == SKC_MAGIC);
1092 ASSERT(spin_is_locked(&skc->skc_lock));
1093
1094 sko = obj + skc->skc_obj_size;
1095 ASSERT(sko->sko_magic == SKO_MAGIC);
1096
1097 sks = sko->sko_slab;
1098 ASSERT(sks->sks_magic == SKS_MAGIC);
1099 ASSERT(sks->sks_cache == skc);
1100 list_add(&sko->sko_list, &sks->sks_free_list);
1101
1102 sks->sks_age = jiffies;
1103 sks->sks_ref--;
1104 skc->skc_obj_alloc--;
1105
1106 /* Move slab to skc_partial_list when no longer full. Slabs
1107          * are added to the head to keep the partial list in quasi-full
1108 * sorted order. Fuller at the head, emptier at the tail. */
1109 if (sks->sks_ref == (sks->sks_objs - 1)) {
1110 list_del(&sks->sks_list);
1111 list_add(&sks->sks_list, &skc->skc_partial_list);
1112 }
1113
1114         /* Move empty slabs to the end of the partial list so
1115 * they can be easily found and freed during reclamation. */
1116 if (sks->sks_ref == 0) {
1117 list_del(&sks->sks_list);
1118 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1119 skc->skc_slab_alloc--;
1120 }
1121
1122 EXIT;
1123 }
1124
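/* Release up to 'flush' objects from the per-CPU magazine back to
 * their slabs and compact the remaining entries to the front of the
 * magazine. Returns the number of objects released. */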
1125 static int
1126 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1127 {
1128 int i, count = MIN(flush, skm->skm_avail);
1129 ENTRY;
1130
1131 ASSERT(skc->skc_magic == SKC_MAGIC);
1132 ASSERT(skm->skm_magic == SKM_MAGIC);
1133
1134 spin_lock(&skc->skc_lock);
1135
1136 for (i = 0; i < count; i++)
1137 spl_cache_shrink(skc, skm->skm_objs[i]);
1138
1139 // __spl_slab_reclaim(skc);
1140 skm->skm_avail -= count;
1141 memmove(skm->skm_objs, &(skm->skm_objs[count]),
1142 sizeof(void *) * skm->skm_avail);
1143
1144 spin_unlock(&skc->skc_lock);
1145
1146 RETURN(count);
1147 }
1148
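/* Allocate an object from the cache. The local per-CPU magazine is
 * used when it has an object available, otherwise it is refilled
 * from the slab layer and the allocation is retried. */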
1149 void *
1150 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1151 {
1152 spl_kmem_magazine_t *skm;
1153 unsigned long irq_flags;
1154 void *obj = NULL;
1155 int id;
1156 ENTRY;
1157
1158 ASSERT(skc->skc_magic == SKC_MAGIC);
1159 ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
1160 local_irq_save(irq_flags);
1161
1162 restart:
1163 /* Safe to update per-cpu structure without lock, but
1164          * in the restart case we must be careful to reacquire
1165 * the local magazine since this may have changed
1166 * when we need to grow the cache. */
1167 id = smp_processor_id();
1168 ASSERTF(id < 4, "cache=%p smp_processor_id=%d\n", skc, id);
1169 skm = skc->skc_mag[smp_processor_id()];
1170 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1171 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1172 skm->skm_size, skm->skm_refill, skm->skm_avail);
1173
1174 if (likely(skm->skm_avail)) {
1175 /* Object available in CPU cache, use it */
1176 obj = skm->skm_objs[--skm->skm_avail];
1177 if (!(skc->skc_flags & KMC_NOTOUCH))
1178 skm->skm_age = jiffies;
1179 } else {
1180 /* Per-CPU cache empty, directly allocate from
1181 * the slab and refill the per-CPU cache. */
1182 (void)spl_cache_refill(skc, skm, flags);
1183 GOTO(restart, obj = NULL);
1184 }
1185
1186 local_irq_restore(irq_flags);
1187 ASSERT(obj);
1188
1189 /* Pre-emptively migrate object to CPU L1 cache */
1190 prefetchw(obj);
1191
1192 RETURN(obj);
1193 }
1194 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1195
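/* Return an object to the cache by placing it in the local per-CPU
 * magazine, flushing part of the magazine first when it is full. */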
1196 void
1197 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1198 {
1199 spl_kmem_magazine_t *skm;
1200 unsigned long flags;
1201 ENTRY;
1202
1203 ASSERT(skc->skc_magic == SKC_MAGIC);
1204 local_irq_save(flags);
1205
1206 /* Safe to update per-cpu structure without lock, but
1207          * because no remote memory allocation tracking is performed
1208          * it is entirely possible to allocate an object from one
1209 * CPU cache and return it to another. */
1210 skm = skc->skc_mag[smp_processor_id()];
1211 ASSERT(skm->skm_magic == SKM_MAGIC);
1212
1213 /* Per-CPU cache full, flush it to make space */
1214 if (unlikely(skm->skm_avail >= skm->skm_size))
1215 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1216
1217 /* Available space in cache, use it */
1218 skm->skm_objs[skm->skm_avail++] = obj;
1219
1220 local_irq_restore(flags);
1221
1222 EXIT;
1223 }
1224 EXPORT_SYMBOL(spl_kmem_cache_free);
1225
1226 static int
1227 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
1228 {
1229 spl_kmem_cache_t *skc;
1230
1231 /* Under linux a shrinker is not tightly coupled with a slab
1232          * cache. In fact linux always systematically tries calling all
1233 * registered shrinker callbacks until its target reclamation level
1234 * is reached. Because of this we only register one shrinker
1235 * function in the shim layer for all slab caches. And we always
1236 * attempt to shrink all caches when this generic shrinker is called.
1237 */
1238 down_read(&spl_kmem_cache_sem);
1239
1240 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
1241 spl_kmem_cache_reap_now(skc);
1242
1243 up_read(&spl_kmem_cache_sem);
1244
1245         /* XXX: Under linux a shrinker should return the remaining number of
1246          * entries in the cache; we should do this as well.
1247 */
1248 return 1;
1249 }
1250
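/* Reap a cache. Its reclaim callback is invoked, per-CPU magazines
 * which have been idle for more than skc_delay seconds are flushed,
 * and aged empty slabs are freed back to the system. */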
1251 void
1252 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1253 {
1254 spl_kmem_magazine_t *skm;
1255 int i;
1256 ENTRY;
1257
1258 ASSERT(skc->skc_magic == SKC_MAGIC);
1259
1260 if (skc->skc_reclaim)
1261 skc->skc_reclaim(skc->skc_private);
1262
1263 /* Ensure per-CPU caches which are idle gradually flush */
1264 for_each_online_cpu(i) {
1265 skm = skc->skc_mag[i];
1266
1267 if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1268 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1269 }
1270
1271 spl_slab_reclaim(skc);
1272
1273 EXIT;
1274 }
1275 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1276
1277 void
1278 spl_kmem_reap(void)
1279 {
1280 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
1281 }
1282 EXPORT_SYMBOL(spl_kmem_reap);
1283
1284 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
1285 static char *
1286 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
1287 {
1288 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
1289 int i, flag = 1;
1290
1291 ASSERT(str != NULL && len >= 17);
1292 memset(str, 0, len);
1293
1294 /* Check for a fully printable string, and while we are at
1295 * it place the printable characters in the passed buffer. */
1296 for (i = 0; i < size; i++) {
1297 str[i] = ((char *)(kd->kd_addr))[i];
1298 if (isprint(str[i])) {
1299 continue;
1300 } else {
1301 /* Minimum number of printable characters found
1302 * to make it worthwhile to print this as ascii. */
1303 if (i > min)
1304 break;
1305
1306 flag = 0;
1307 break;
1308 }
1309 }
1310
1311 if (!flag) {
1312 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1313 *((uint8_t *)kd->kd_addr),
1314 *((uint8_t *)kd->kd_addr + 2),
1315 *((uint8_t *)kd->kd_addr + 4),
1316 *((uint8_t *)kd->kd_addr + 6),
1317 *((uint8_t *)kd->kd_addr + 8),
1318 *((uint8_t *)kd->kd_addr + 10),
1319 *((uint8_t *)kd->kd_addr + 12),
1320 *((uint8_t *)kd->kd_addr + 14));
1321 }
1322
1323 return str;
1324 }
1325
1326 static int
1327 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1328 {
1329 int i;
1330 ENTRY;
1331
1332 spin_lock_init(lock);
1333 INIT_LIST_HEAD(list);
1334
1335 for (i = 0; i < size; i++)
1336 INIT_HLIST_HEAD(&kmem_table[i]);
1337
1338 RETURN(0);
1339 }
1340
1341 static void
1342 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
1343 {
1344 unsigned long flags;
1345 kmem_debug_t *kd;
1346 char str[17];
1347 ENTRY;
1348
1349 spin_lock_irqsave(lock, flags);
1350 if (!list_empty(list))
1351 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1352 "size", "data", "func", "line");
1353
1354 list_for_each_entry(kd, list, kd_list)
1355 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1356 kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
1357 kd->kd_func, kd->kd_line);
1358
1359 spin_unlock_irqrestore(lock, flags);
1360 EXIT;
1361 }
1362 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1363 #define spl_kmem_init_tracking(list, lock, size)
1364 #define spl_kmem_fini_tracking(list, lock)
1365 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1366
1367 int
1368 spl_kmem_init(void)
1369 {
1370 int rc = 0;
1371 ENTRY;
1372
1373 init_rwsem(&spl_kmem_cache_sem);
1374 INIT_LIST_HEAD(&spl_kmem_cache_list);
1375
1376 #ifdef HAVE_SET_SHRINKER
1377 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1378 spl_kmem_cache_generic_shrinker);
1379 if (spl_kmem_cache_shrinker == NULL)
1380 RETURN(rc = -ENOMEM);
1381 #else
1382 register_shrinker(&spl_kmem_cache_shrinker);
1383 #endif
1384
1385 #ifdef DEBUG_KMEM
1386 atomic64_set(&kmem_alloc_used, 0);
1387 atomic64_set(&vmem_alloc_used, 0);
1388
1389 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1390 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1391 #endif
1392 RETURN(rc);
1393 }
1394
1395 void
1396 spl_kmem_fini(void)
1397 {
1398 #ifdef DEBUG_KMEM
1399 /* Display all unreclaimed memory addresses, including the
1400 * allocation size and the first few bytes of what's located
1401 * at that address to aid in debugging. Performance is not
1402 * a serious concern here since it is module unload time. */
1403 if (atomic64_read(&kmem_alloc_used) != 0)
1404 CWARN("kmem leaked %ld/%ld bytes\n",
1405 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
1406
1407
1408 if (atomic64_read(&vmem_alloc_used) != 0)
1409 CWARN("vmem leaked %ld/%ld bytes\n",
1410 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
1411
1412 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1413 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1414 #endif /* DEBUG_KMEM */
1415 ENTRY;
1416
1417 #ifdef HAVE_SET_SHRINKER
1418 remove_shrinker(spl_kmem_cache_shrinker);
1419 #else
1420 unregister_shrinker(&spl_kmem_cache_shrinker);
1421 #endif
1422
1423 EXIT;
1424 }