1 /*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
27 #include <sys/kmem.h>
28
29 #ifdef DEBUG_SUBSYSTEM
30 # undef DEBUG_SUBSYSTEM
31 #endif
32
33 #define DEBUG_SUBSYSTEM S_KMEM
34
35 /*
36 * Memory allocation interfaces and debugging for basic kmem_*
37 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
38 * all allocations will be tracked when they are allocated and
39 * freed. When the SPL module is unloaded a list of all leaked
40 * addresses and where they were allocated will be dumped to the
41 * console. Enabling this feature has a significant impact on
42 * performance but it makes finding memory leaks straightforward.
43 */
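/*
 * Illustrative usage sketch (not part of the original source; the structure
 * and its size are hypothetical). With DEBUG_KMEM enabled every matched
 * kmem_alloc()/kmem_free() pair is accounted for, so a missing free shows
 * up in the leak report printed at module unload:
 *
 *   typedef struct my_ctx { int mc_busy; } my_ctx_t;
 *
 *   my_ctx_t *ctx = kmem_alloc(sizeof(my_ctx_t), KM_SLEEP);
 *   ...
 *   kmem_free(ctx, sizeof(my_ctx_t));   // omit this and the address, size,
 *                                       // function, and line of the alloc
 *                                       // are dumped to the console
 */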
44 #ifdef DEBUG_KMEM
45 /* Shim layer memory accounting */
46 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
47 unsigned long long kmem_alloc_max = 0;
48 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
49 unsigned long long vmem_alloc_max = 0;
50 int kmem_warning_flag = 1;
51
52 EXPORT_SYMBOL(kmem_alloc_used);
53 EXPORT_SYMBOL(kmem_alloc_max);
54 EXPORT_SYMBOL(vmem_alloc_used);
55 EXPORT_SYMBOL(vmem_alloc_max);
56 EXPORT_SYMBOL(kmem_warning_flag);
57
58 # ifdef DEBUG_KMEM_TRACKING
59
60 /* XXX - Not too surprisingly, with debugging enabled the xmem_locks are very
61 * highly contended, particularly on xfree(). If we want to run with this
62 * detailed debugging enabled for anything other than debugging we need to
63 * minimize the contention by moving to a lock per xmem_table entry model.
64 */
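/*
 * Hedged sketch of the lock-per-entry model suggested above (an assumption,
 * not something implemented in this file): each hash bucket would carry its
 * own spinlock so that xalloc()/xfree() hitting different buckets no longer
 * serialize on a single global lock.
 *
 *   typedef struct kmem_bucket {
 *           spinlock_t        kb_lock;   // protects only this bucket
 *           struct hlist_head kb_head;   // chain of kmem_debug_t entries
 *   } kmem_bucket_t;
 *
 *   static kmem_bucket_t kmem_buckets[KMEM_TABLE_SIZE];
 */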
65
66 # define KMEM_HASH_BITS 10
67 # define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
68
69 # define VMEM_HASH_BITS 10
70 # define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
71
72 typedef struct kmem_debug {
73 struct hlist_node kd_hlist; /* Hash node linkage */
74 struct list_head kd_list; /* List of all allocations */
75 void *kd_addr; /* Allocation pointer */
76 size_t kd_size; /* Allocation size */
77 const char *kd_func; /* Allocation function */
78 int kd_line; /* Allocation line */
79 } kmem_debug_t;
80
81 spinlock_t kmem_lock;
82 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
83 struct list_head kmem_list;
84
85 spinlock_t vmem_lock;
86 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
87 struct list_head vmem_list;
88
89 EXPORT_SYMBOL(kmem_lock);
90 EXPORT_SYMBOL(kmem_table);
91 EXPORT_SYMBOL(kmem_list);
92
93 EXPORT_SYMBOL(vmem_lock);
94 EXPORT_SYMBOL(vmem_table);
95 EXPORT_SYMBOL(vmem_list);
96 # endif
97
98 int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
99 #else
100 int kmem_set_warning(int flag) { return 0; }
101 #endif
102 EXPORT_SYMBOL(kmem_set_warning);
103
104 /*
105 * Slab allocation interfaces
106 *
107 * While the Linux slab implementation was inspired by the Solaris
108 * implementation, I cannot use it to emulate the Solaris APIs. I
109 * require two features which are not provided by the Linux slab.
110 *
111 * 1) Constructors AND destructors. Recent versions of the Linux
112 * kernel have removed support for destructors. This is a deal
113 * breaker for the SPL which contains particularly expensive
114 * initializers for mutexes, condition variables, etc. We also
115 * require a minimal level of cleanup for these data types, unlike
116 * many Linux data types which do not need to be explicitly destroyed.
117 *
118 * 2) Virtual address space backed slab. Callers of the Solaris slab
119 * expect it to work well for both small and very large allocations.
120 * Because of memory fragmentation the Linux slab, which is backed
121 * by kmalloc'ed memory, performs very badly when confronted with
122 * large numbers of large allocations. Basing the slab on the
123 * virtual address space removes the need for contiguous pages
124 * and greatly improves performance for large allocations.
125 *
126 * For these reasons, the SPL has its own slab implementation with
127 * the needed features. It is not as highly optimized as either the
128 * Solaris or Linux slabs, but it should get me most of what is
129 * needed until it can be optimized or obsoleted by another approach.
130 *
131 * One serious concern I do have about this method is the relatively
132 * small virtual address space on 32bit arches. This will seriously
133 * constrain the size of the slab caches and their performance.
134 *
135 * XXX: Implement work requests to keep an eye on each cache and
136 * shrink them via spl_slab_reclaim() when they are wasting lots
137 * of space. Currently this process is driven by the reapers.
138 *
139 * XXX: Improve the partial slab list by carefully maintaining a
140 * strict ordering of fullest to emptiest slabs based on
141 * the slab reference count. This guarantees that when freeing
142 * slabs back to the system we need only linearly traverse the
143 * last N slabs in the list to discover all the freeable slabs.
144 *
145 * XXX: NUMA awareness for optionally allocating memory close to a
146 * particular core. This can be advantageous if you know the slab
147 * object will be short lived and primarily accessed from one core.
148 *
149 * XXX: Slab coloring may also yield performance improvements and would
150 * be desirable to implement.
151 */
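/*
 * Illustrative usage sketch (not from the original source; the object type,
 * callbacks, and names are hypothetical, and the mutex calls assume the
 * Solaris-style primitives provided elsewhere in the SPL). A cache is
 * created with a constructor and destructor, objects are allocated and
 * freed through the per-CPU magazines, and the cache is destroyed once no
 * objects remain outstanding:
 *
 *   typedef struct my_obj { kmutex_t mo_lock; int mo_state; } my_obj_t;
 *
 *   static void my_ctor(void *buf, void *priv, int flags)
 *   { mutex_init(&((my_obj_t *)buf)->mo_lock, NULL, MUTEX_DEFAULT, NULL); }
 *
 *   static void my_dtor(void *buf, void *priv)
 *   { mutex_destroy(&((my_obj_t *)buf)->mo_lock); }
 *
 *   spl_kmem_cache_t *cache = spl_kmem_cache_create("my_obj_cache",
 *       sizeof(my_obj_t), 0, my_ctor, my_dtor, NULL, NULL, NULL, 0);
 *   my_obj_t *obj = spl_kmem_cache_alloc(cache, KM_SLEEP);
 *   ...
 *   spl_kmem_cache_free(cache, obj);
 *   spl_kmem_cache_destroy(cache);
 */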
152
153 struct list_head spl_kmem_cache_list; /* List of caches */
154 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
155
156 static int spl_cache_flush(spl_kmem_cache_t *skc,
157 spl_kmem_magazine_t *skm, int flush);
158
159 #ifdef HAVE_SET_SHRINKER
160 static struct shrinker *spl_kmem_cache_shrinker;
161 #else
162 static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
163 unsigned int gfp_mask);
164 static struct shrinker spl_kmem_cache_shrinker = {
165 .shrink = spl_kmem_cache_generic_shrinker,
166 .seeks = KMC_DEFAULT_SEEKS,
167 };
168 #endif
169
170 #ifdef DEBUG_KMEM
171 # ifdef DEBUG_KMEM_TRACKING
172
173 static kmem_debug_t *
174 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
175 void *addr)
176 {
177 struct hlist_head *head;
178 struct hlist_node *node;
179 struct kmem_debug *p;
180 unsigned long flags;
181 ENTRY;
182
183 spin_lock_irqsave(lock, flags);
184
185 head = &table[hash_ptr(addr, bits)];
186 hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
187 if (p->kd_addr == addr) {
188 hlist_del_init(&p->kd_hlist);
189 list_del_init(&p->kd_list);
190 spin_unlock_irqrestore(lock, flags);
191 return p;
192 }
193 }
194
195 spin_unlock_irqrestore(lock, flags);
196
197 RETURN(NULL);
198 }
199
200 void *
201 kmem_alloc_track(size_t size, int flags, const char *func, int line,
202 int node_alloc, int node)
203 {
204 void *ptr = NULL;
205 kmem_debug_t *dptr;
206 unsigned long irq_flags;
207 ENTRY;
208
209 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
210 flags & ~__GFP_ZERO);
211
212 if (dptr == NULL) {
213 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
214 sizeof(kmem_debug_t), flags);
215 } else {
216 /* Marked unlikely because we should never be doing this,
217 * we tolerate up to 2 pages but a single page is best. */
218 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
219 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
220 (unsigned long long) size, flags,
221 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
222
223 /* We use kstrdup() below because the string pointed to by
224 * __FUNCTION__ might not be available by the time we want
225 * to print it since the module might have been unloaded. */
226 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
227 if (unlikely(dptr->kd_func == NULL)) {
228 kfree(dptr);
229 CWARN("kstrdup() failed in kmem_alloc(%llu, 0x%x) "
230 "(%lld/%llu)\n", (unsigned long long) size, flags,
231 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
232 goto out;
233 }
234
235 /* Use the correct allocator */
236 if (node_alloc) {
237 ASSERT(!(flags & __GFP_ZERO));
238 ptr = kmalloc_node(size, flags, node);
239 } else if (flags & __GFP_ZERO) {
240 ptr = kzalloc(size, flags & ~__GFP_ZERO);
241 } else {
242 ptr = kmalloc(size, flags);
243 }
244
245 if (unlikely(ptr == NULL)) {
246 kfree(dptr->kd_func);
247 kfree(dptr);
248 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
249 (unsigned long long) size, flags,
250 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
251 goto out;
252 }
253
254 atomic64_add(size, &kmem_alloc_used);
255 if (unlikely(atomic64_read(&kmem_alloc_used) >
256 kmem_alloc_max))
257 kmem_alloc_max =
258 atomic64_read(&kmem_alloc_used);
259
260 INIT_HLIST_NODE(&dptr->kd_hlist);
261 INIT_LIST_HEAD(&dptr->kd_list);
262
263 dptr->kd_addr = ptr;
264 dptr->kd_size = size;
265 dptr->kd_line = line;
266
267 spin_lock_irqsave(&kmem_lock, irq_flags);
268 hlist_add_head_rcu(&dptr->kd_hlist,
269 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
270 list_add_tail(&dptr->kd_list, &kmem_list);
271 spin_unlock_irqrestore(&kmem_lock, irq_flags);
272
273 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
274 "(%lld/%llu)\n", (unsigned long long) size, flags,
275 ptr, atomic64_read(&kmem_alloc_used),
276 kmem_alloc_max);
277 }
278 out:
279 RETURN(ptr);
280 }
281 EXPORT_SYMBOL(kmem_alloc_track);
282
283 void
284 kmem_free_track(void *ptr, size_t size)
285 {
286 kmem_debug_t *dptr;
287 ENTRY;
288
289 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
290 (unsigned long long) size);
291
292 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
293
294 ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
295
296 /* Size must match */
297 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
298 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
299 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
300
301 atomic64_sub(size, &kmem_alloc_used);
302
303 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
304 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
305 kmem_alloc_max);
306
307 kfree(dptr->kd_func);
308
309 memset(dptr, 0x5a, sizeof(kmem_debug_t));
310 kfree(dptr);
311
312 memset(ptr, 0x5a, size);
313 kfree(ptr);
314
315 EXIT;
316 }
317 EXPORT_SYMBOL(kmem_free_track);
318
319 void *
320 vmem_alloc_track(size_t size, int flags, const char *func, int line)
321 {
322 void *ptr = NULL;
323 kmem_debug_t *dptr;
324 unsigned long irq_flags;
325 ENTRY;
326
327 ASSERT(flags & KM_SLEEP);
328
329 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
330 if (dptr == NULL) {
331 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
332 sizeof(kmem_debug_t), flags);
333 } else {
334 /* We use kstrdup() below because the string pointed to by
335 * __FUNCTION__ might not be available by the time we want
336 * to print it, since the module might have been unloaded. */
337 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
338 if (unlikely(dptr->kd_func == NULL)) {
339 kfree(dptr);
340 CWARN("kstrdup() failed in vmem_alloc(%llu, 0x%x) "
341 "(%lld/%llu)\n", (unsigned long long) size, flags,
342 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
343 goto out;
344 }
345
346 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
347 PAGE_KERNEL);
348
349 if (unlikely(ptr == NULL)) {
350 kfree(dptr->kd_func);
351 kfree(dptr);
352 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
353 (unsigned long long) size, flags,
354 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
355 goto out;
356 }
357
358 if (flags & __GFP_ZERO)
359 memset(ptr, 0, size);
360
361 atomic64_add(size, &vmem_alloc_used);
362 if (unlikely(atomic64_read(&vmem_alloc_used) >
363 vmem_alloc_max))
364 vmem_alloc_max =
365 atomic64_read(&vmem_alloc_used);
366
367 INIT_HLIST_NODE(&dptr->kd_hlist);
368 INIT_LIST_HEAD(&dptr->kd_list);
369
370 dptr->kd_addr = ptr;
371 dptr->kd_size = size;
372 dptr->kd_line = line;
373
374 spin_lock_irqsave(&vmem_lock, irq_flags);
375 hlist_add_head_rcu(&dptr->kd_hlist,
376 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
377 list_add_tail(&dptr->kd_list, &vmem_list);
378 spin_unlock_irqrestore(&vmem_lock, irq_flags);
379
380 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
381 "(%lld/%llu)\n", (unsigned long long) size, flags,
382 ptr, atomic64_read(&vmem_alloc_used),
383 vmem_alloc_max);
384 }
385 out:
386 RETURN(ptr);
387 }
388 EXPORT_SYMBOL(vmem_alloc_track);
389
390 void
391 vmem_free_track(void *ptr, size_t size)
392 {
393 kmem_debug_t *dptr;
394 ENTRY;
395
396 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
397 (unsigned long long) size);
398
399 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
400 ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
401
402 /* Size must match */
403 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
404 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
405 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
406
407 atomic64_sub(size, &vmem_alloc_used);
408 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
409 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
410 vmem_alloc_max);
411
412 kfree(dptr->kd_func);
413
414 memset(dptr, 0x5a, sizeof(kmem_debug_t));
415 kfree(dptr);
416
417 memset(ptr, 0x5a, size);
418 vfree(ptr);
419
420 EXIT;
421 }
422 EXPORT_SYMBOL(vmem_free_track);
423
424 # else /* DEBUG_KMEM_TRACKING */
425
426 void *
427 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
428 int node_alloc, int node)
429 {
430 void *ptr;
431 ENTRY;
432
433 /* Marked unlikely because we should never be doing this,
434 * we tolerate up to 2 pages but a single page is best. */
435 if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
436 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
437 (unsigned long long) size, flags,
438 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
439
440 /* Use the correct allocator */
441 if (node_alloc) {
442 ASSERT(!(flags & __GFP_ZERO));
443 ptr = kmalloc_node(size, flags, node);
444 } else if (flags & __GFP_ZERO) {
445 ptr = kzalloc(size, flags & (~__GFP_ZERO));
446 } else {
447 ptr = kmalloc(size, flags);
448 }
449
450 if (ptr == NULL) {
451 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
452 (unsigned long long) size, flags,
453 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
454 } else {
455 atomic64_add(size, &kmem_alloc_used);
456 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
457 kmem_alloc_max = atomic64_read(&kmem_alloc_used);
458
459 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
460 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
461 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
462 }
463 RETURN(ptr);
464 }
465 EXPORT_SYMBOL(kmem_alloc_debug);
466
467 void
468 kmem_free_debug(void *ptr, size_t size)
469 {
470 ENTRY;
471
472 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
473 (unsigned long long) size);
474
475 atomic64_sub(size, &kmem_alloc_used);
476
477 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
478 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
479 kmem_alloc_max);
480
481 memset(ptr, 0x5a, size);
482 kfree(ptr);
483
484 EXIT;
485 }
486 EXPORT_SYMBOL(kmem_free_debug);
487
488 void *
489 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
490 {
491 void *ptr;
492 ENTRY;
493
494 ASSERT(flags & KM_SLEEP);
495
496 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
497 PAGE_KERNEL);
498 if (ptr == NULL) {
499 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
500 (unsigned long long) size, flags,
501 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
502 } else {
503 if (flags & __GFP_ZERO)
504 memset(ptr, 0, size);
505
506 atomic64_add(size, &vmem_alloc_used);
507
508 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
509 vmem_alloc_max = atomic64_read(&vmem_alloc_used);
510
511 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
512 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
513 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
514 }
515
516 RETURN(ptr);
517 }
518 EXPORT_SYMBOL(vmem_alloc_debug);
519
520 void
521 vmem_free_debug(void *ptr, size_t size)
522 {
523 ENTRY;
524
525 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
526 (unsigned long long) size);
527
528 atomic64_sub(size, &vmem_alloc_used);
529
530 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
531 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
532 vmem_alloc_max);
533
534 memset(ptr, 0x5a, size);
535 vfree(ptr);
536
537 EXIT;
538 }
539 EXPORT_SYMBOL(vmem_free_debug);
540
541 # endif /* DEBUG_KMEM_TRACKING */
542 #endif /* DEBUG_KMEM */
543
544 static void *
545 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
546 {
547 void *ptr;
548
549 if (skc->skc_flags & KMC_KMEM) {
550 if (size > (2 * PAGE_SIZE)) {
551 ptr = (void *)__get_free_pages(flags, get_order(size));
552 } else
553 ptr = kmem_alloc(size, flags);
554 } else {
555 ptr = vmem_alloc(size, flags);
556 }
557
558 return ptr;
559 }
560
561 static void
562 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
563 {
564 if (skc->skc_flags & KMC_KMEM) {
565 if (size > (2 * PAGE_SIZE))
566 free_pages((unsigned long)ptr, get_order(size));
567 else
568 kmem_free(ptr, size);
569 } else {
570 vmem_free(ptr, size);
571 }
572 }
573
574 /* It's important that we pack the spl_kmem_obj_t structure and the
575 * actual objects into one large address space to minimize the number
576 * of calls to the allocator. It is far better to do a few large
577 * allocations and then subdivide them ourselves. Now which allocator
578 * we use requires balancing a few trade-offs.
579 *
580 * For small objects we use kmem_alloc() because as long as you are
581 * only requesting a small number of pages (ideally just one) it's cheap.
582 * However, when you start requesting multiple pages with kmem_alloc()
583 * it gets increasingly expensive since it requires contiguous pages.
584 * For this reason we shift to vmem_alloc() for slabs of large objects
585 * which removes the need for contiguous pages. We do not use
586 * vmem_alloc() in all cases because there is significant locking
587 * overhead in __get_vm_area_node(). This function takes a single
588 * global lock when acquiring an available virtual address range which
589 * serializes all vmem_alloc()'s for all slab caches. Using slightly
590 * different allocation functions for small and large objects should
591 * give us the best of both worlds.
592 *
593 * KMC_ONSLAB KMC_OFFSLAB
594 *
595 * +------------------------+ +-----------------+
596 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
597 * | skc_obj_size <-+ | | +-----------------+ | |
598 * | spl_kmem_obj_t | | | |
599 * | skc_obj_size <---+ | +-----------------+ | |
600 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
601 * | ... v | | spl_kmem_obj_t | |
602 * +------------------------+ +-----------------+ v
603 */
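/*
 * Worked KMC_ONSLAB layout example (hypothetical numbers, assuming an
 * 8-byte skc_obj_align): with skc_obj_size = 100, each packed object
 * consumes size = P2ROUNDUP(100, 8) + P2ROUNDUP(sizeof(spl_kmem_obj_t), 8)
 * bytes, the i-th object starts at
 *   base + P2ROUNDUP(sizeof(spl_kmem_slab_t), 8) + (i * size)
 * and its spl_kmem_obj_t bookkeeping header sits at
 *   obj + P2ROUNDUP(100, 8)
 * which is exactly the arithmetic used by spl_slab_alloc() below and by
 * spl_cache_shrink() when mapping an object back to its slab.
 */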
604 static spl_kmem_slab_t *
605 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
606 {
607 spl_kmem_slab_t *sks;
608 spl_kmem_obj_t *sko, *n;
609 void *base, *obj;
610 int i, align, size, rc = 0;
611
612 base = kv_alloc(skc, skc->skc_slab_size, flags);
613 if (base == NULL)
614 RETURN(NULL);
615
616 sks = (spl_kmem_slab_t *)base;
617 sks->sks_magic = SKS_MAGIC;
618 sks->sks_objs = skc->skc_slab_objs;
619 sks->sks_age = jiffies;
620 sks->sks_cache = skc;
621 INIT_LIST_HEAD(&sks->sks_list);
622 INIT_LIST_HEAD(&sks->sks_free_list);
623 sks->sks_ref = 0;
624
625 align = skc->skc_obj_align;
626 size = P2ROUNDUP(skc->skc_obj_size, align) +
627 P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
628
629 for (i = 0; i < sks->sks_objs; i++) {
630 if (skc->skc_flags & KMC_OFFSLAB) {
631 obj = kv_alloc(skc, size, flags);
632 if (!obj)
633 GOTO(out, rc = -ENOMEM);
634 } else {
635 obj = base +
636 P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
637 (i * size);
638 }
639
640 sko = obj + P2ROUNDUP(skc->skc_obj_size, align);
641 sko->sko_addr = obj;
642 sko->sko_magic = SKO_MAGIC;
643 sko->sko_slab = sks;
644 INIT_LIST_HEAD(&sko->sko_list);
645 list_add_tail(&sko->sko_list, &sks->sks_free_list);
646 }
647
648 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
649 if (skc->skc_ctor)
650 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
651 out:
652 if (rc) {
653 if (skc->skc_flags & KMC_OFFSLAB)
654 list_for_each_entry_safe(sko, n, &sks->sks_free_list,
655 sko_list)
656 kv_free(skc, sko->sko_addr, size);
657
658 kv_free(skc, base, skc->skc_slab_size);
659 sks = NULL;
660 }
661
662 RETURN(sks);
663 }
664
665 /* Removes a slab from the complete or partial list, so it must
666 * be called with the 'skc->skc_lock' held.
667 */
668 static void
669 spl_slab_free(spl_kmem_slab_t *sks) {
670 spl_kmem_cache_t *skc;
671 spl_kmem_obj_t *sko, *n;
672 int size;
673 ENTRY;
674
675 ASSERT(sks->sks_magic == SKS_MAGIC);
676 ASSERT(sks->sks_ref == 0);
677
678 skc = sks->sks_cache;
679 ASSERT(skc->skc_magic == SKC_MAGIC);
680 ASSERT(spin_is_locked(&skc->skc_lock));
681
682 skc->skc_obj_total -= sks->sks_objs;
683 skc->skc_slab_total--;
684 list_del(&sks->sks_list);
685 size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
686 P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
687
688 /* Run destructors as the slab is being released */
689 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
690 ASSERT(sko->sko_magic == SKO_MAGIC);
691
692 if (skc->skc_dtor)
693 skc->skc_dtor(sko->sko_addr, skc->skc_private);
694
695 if (skc->skc_flags & KMC_OFFSLAB)
696 kv_free(skc, sko->sko_addr, size);
697 }
698
699 kv_free(skc, sks, skc->skc_slab_size);
700 EXIT;
701 }
702
703 static int
704 __spl_slab_reclaim(spl_kmem_cache_t *skc)
705 {
706 spl_kmem_slab_t *sks, *m;
707 int rc = 0;
708 ENTRY;
709
710 ASSERT(spin_is_locked(&skc->skc_lock));
711 /*
712 * Free empty slabs which have not been touched in skc_delay
713 * seconds. This delay time is important to avoid thrashing.
714 * Empty slabs will be at the end of the skc_partial_list.
715 */
716 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
717 sks_list) {
718 if (sks->sks_ref > 0)
719 break;
720
721 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
722 spl_slab_free(sks);
723 rc++;
724 }
725 }
726
727 /* Returns number of slabs reclaimed */
728 RETURN(rc);
729 }
730
731 static int
732 spl_slab_reclaim(spl_kmem_cache_t *skc)
733 {
734 int rc;
735 ENTRY;
736
737 spin_lock(&skc->skc_lock);
738 rc = __spl_slab_reclaim(skc);
739 spin_unlock(&skc->skc_lock);
740
741 RETURN(rc);
742 }
743
744 /* Size slabs properly to ensure they are not too large */
745 static int
746 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
747 {
748 int max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
749 int align = skc->skc_obj_align;
750
751 *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
752
753 if (skc->skc_flags & KMC_OFFSLAB) {
754 *size = sizeof(spl_kmem_slab_t);
755 } else {
756 resize:
757 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
758 *objs * (P2ROUNDUP(skc->skc_obj_size, align) +
759 P2ROUNDUP(sizeof(spl_kmem_obj_t), align));
760
761 if (*size > max)
762 GOTO(resize, *objs = *objs - 1);
763
764 ASSERT(*objs > 0);
765 }
766
767 ASSERTF(*size <= max, "%d < %d\n", *size, max);
768 RETURN(0);
769 }
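/*
 * Sizing sketch with hypothetical but typical numbers: with PAGE_SIZE = 4096
 * and MAX_ORDER = 11 the cap works out to max = (1 << 10) * 4096 = 4 MiB.
 * An on-slab cache whose SPL_KMEM_CACHE_OBJ_PER_SLAB starting count would
 * push the slab (header + objects + their spl_kmem_obj_t headers) past that
 * cap is trimmed one object at a time by the resize loop above until it fits.
 */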
770
771 static int
772 spl_magazine_size(spl_kmem_cache_t *skc)
773 {
774 int size, align = skc->skc_obj_align;
775 ENTRY;
776
777 /* Guesses for reasonable magazine sizes; they
778 * should really adapt based on observed usage. */
779 if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256))
780 size = 4;
781 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32))
782 size = 16;
783 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE))
784 size = 64;
785 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4))
786 size = 128;
787 else
788 size = 512;
789
790 RETURN(size);
791 }
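/*
 * Illustrative mapping of the thresholds above (assuming PAGE_SIZE = 4096):
 * a 2 KiB object (> PAGE_SIZE / 4) gets a 128-entry per-CPU magazine, an
 * 8 KiB object (> PAGE_SIZE) gets 64 entries, and anything larger than
 * PAGE_SIZE * 256 (1 MiB) is capped at 4 entries, keeping the per-CPU
 * cache footprint roughly bounded as the object size grows.
 */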
792
793 static spl_kmem_magazine_t *
794 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
795 {
796 spl_kmem_magazine_t *skm;
797 int size = sizeof(spl_kmem_magazine_t) +
798 sizeof(void *) * skc->skc_mag_size;
799 ENTRY;
800
801 skm = kmem_alloc_node(size, GFP_KERNEL, node);
802 if (skm) {
803 skm->skm_magic = SKM_MAGIC;
804 skm->skm_avail = 0;
805 skm->skm_size = skc->skc_mag_size;
806 skm->skm_refill = skc->skc_mag_refill;
807 if (!(skc->skc_flags & KMC_NOTOUCH))
808 skm->skm_age = jiffies;
809 }
810
811 RETURN(skm);
812 }
813
814 static void
815 spl_magazine_free(spl_kmem_magazine_t *skm)
816 {
817 int size = sizeof(spl_kmem_magazine_t) +
818 sizeof(void *) * skm->skm_size;
819
820 ENTRY;
821 ASSERT(skm->skm_magic == SKM_MAGIC);
822 ASSERT(skm->skm_avail == 0);
823
824 kmem_free(skm, size);
825 EXIT;
826 }
827
828 static int
829 spl_magazine_create(spl_kmem_cache_t *skc)
830 {
831 int i;
832 ENTRY;
833
834 skc->skc_mag_size = spl_magazine_size(skc);
835 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
836
837 for_each_online_cpu(i) {
838 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
839 if (!skc->skc_mag[i]) {
840 for (i--; i >= 0; i--)
841 spl_magazine_free(skc->skc_mag[i]);
842
843 RETURN(-ENOMEM);
844 }
845 }
846
847 RETURN(0);
848 }
849
850 static void
851 spl_magazine_destroy(spl_kmem_cache_t *skc)
852 {
853 spl_kmem_magazine_t *skm;
854 int i;
855 ENTRY;
856
857 for_each_online_cpu(i) {
858 skm = skc->skc_mag[i];
859 (void)spl_cache_flush(skc, skm, skm->skm_avail);
860 spl_magazine_free(skm);
861 }
862
863 EXIT;
864 }
865
866 spl_kmem_cache_t *
867 spl_kmem_cache_create(char *name, size_t size, size_t align,
868 spl_kmem_ctor_t ctor,
869 spl_kmem_dtor_t dtor,
870 spl_kmem_reclaim_t reclaim,
871 void *priv, void *vmp, int flags)
872 {
873 spl_kmem_cache_t *skc;
874 int rc, kmem_flags = KM_SLEEP;
875 ENTRY;
876
877 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
878 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
879 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
880 ASSERT(vmp == NULL);
881
882 /* We may be called when there is a non-zero preempt_count or
883 * interrupts are disabled in which case we must not sleep.
884 */
885 if (current_thread_info()->preempt_count || irqs_disabled())
886 kmem_flags = KM_NOSLEEP;
887
888 /* Allocate new cache memory and initialize. */
889 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
890 if (skc == NULL)
891 RETURN(NULL);
892
893 skc->skc_magic = SKC_MAGIC;
894 skc->skc_name_size = strlen(name) + 1;
895 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
896 if (skc->skc_name == NULL) {
897 kmem_free(skc, sizeof(*skc));
898 RETURN(NULL);
899 }
900 strncpy(skc->skc_name, name, skc->skc_name_size);
901
902 skc->skc_ctor = ctor;
903 skc->skc_dtor = dtor;
904 skc->skc_reclaim = reclaim;
905 skc->skc_private = priv;
906 skc->skc_vmp = vmp;
907 skc->skc_flags = flags;
908 skc->skc_obj_size = size;
909 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
910 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
911
912 INIT_LIST_HEAD(&skc->skc_list);
913 INIT_LIST_HEAD(&skc->skc_complete_list);
914 INIT_LIST_HEAD(&skc->skc_partial_list);
915 spin_lock_init(&skc->skc_lock);
916 skc->skc_slab_fail = 0;
917 skc->skc_slab_create = 0;
918 skc->skc_slab_destroy = 0;
919 skc->skc_slab_total = 0;
920 skc->skc_slab_alloc = 0;
921 skc->skc_slab_max = 0;
922 skc->skc_obj_total = 0;
923 skc->skc_obj_alloc = 0;
924 skc->skc_obj_max = 0;
925
926 if (align) {
927 ASSERT((align & (align - 1)) == 0); /* Power of two */
928 ASSERT(align >= SPL_KMEM_CACHE_ALIGN); /* Minimum size */
929 skc->skc_obj_align = align;
930 }
931
932 /* If no type was passed, select a cache type based on object size */
933 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
934 if (P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) <
935 (PAGE_SIZE / 8)) {
936 skc->skc_flags |= KMC_KMEM;
937 } else {
938 skc->skc_flags |= KMC_VMEM;
939 }
940 }
941
942 rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
943 if (rc)
944 GOTO(out, rc);
945
946 rc = spl_magazine_create(skc);
947 if (rc)
948 GOTO(out, rc);
949
950 down_write(&spl_kmem_cache_sem);
951 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
952 up_write(&spl_kmem_cache_sem);
953
954 RETURN(skc);
955 out:
956 kmem_free(skc->skc_name, skc->skc_name_size);
957 kmem_free(skc, sizeof(*skc));
958 RETURN(NULL);
959 }
960 EXPORT_SYMBOL(spl_kmem_cache_create);
961
962 void
963 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
964 {
965 spl_kmem_slab_t *sks, *m;
966 ENTRY;
967
968 ASSERT(skc->skc_magic == SKC_MAGIC);
969
970 down_write(&spl_kmem_cache_sem);
971 list_del_init(&skc->skc_list);
972 up_write(&spl_kmem_cache_sem);
973
974 spl_magazine_destroy(skc);
975 spin_lock(&skc->skc_lock);
976
977 /* Validate there are no objects in use and free all the
978 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
979 ASSERT(list_empty(&skc->skc_complete_list));
980 ASSERT(skc->skc_slab_alloc == 0);
981 ASSERT(skc->skc_obj_alloc == 0);
982
983 list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
984 spl_slab_free(sks);
985
986 ASSERT(skc->skc_slab_total == 0);
987 ASSERT(skc->skc_obj_total == 0);
988
989 kmem_free(skc->skc_name, skc->skc_name_size);
990 spin_unlock(&skc->skc_lock);
991
992 kmem_free(skc, sizeof(*skc));
993
994 EXIT;
995 }
996 EXPORT_SYMBOL(spl_kmem_cache_destroy);
997
998 static void *
999 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1000 {
1001 spl_kmem_obj_t *sko;
1002
1003 ASSERT(skc->skc_magic == SKC_MAGIC);
1004 ASSERT(sks->sks_magic == SKS_MAGIC);
1005 ASSERT(spin_is_locked(&skc->skc_lock));
1006
1007 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1008 ASSERT(sko->sko_magic == SKO_MAGIC);
1009 ASSERT(sko->sko_addr != NULL);
1010
1011 /* Remove from sks_free_list */
1012 list_del_init(&sko->sko_list);
1013
1014 sks->sks_age = jiffies;
1015 sks->sks_ref++;
1016 skc->skc_obj_alloc++;
1017
1018 /* Track max obj usage statistics */
1019 if (skc->skc_obj_alloc > skc->skc_obj_max)
1020 skc->skc_obj_max = skc->skc_obj_alloc;
1021
1022 /* Track max slab usage statistics */
1023 if (sks->sks_ref == 1) {
1024 skc->skc_slab_alloc++;
1025
1026 if (skc->skc_slab_alloc > skc->skc_slab_max)
1027 skc->skc_slab_max = skc->skc_slab_alloc;
1028 }
1029
1030 return sko->sko_addr;
1031 }
1032
1033 /* No objects are available, so create a new slab. Since this is an
1034 * expensive operation we do it without holding the spinlock
1035 * and only briefly acquire it when we link in the fully
1036 * allocated and constructed slab.
1037 */
1038 static spl_kmem_slab_t *
1039 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1040 {
1041 spl_kmem_slab_t *sks;
1042 ENTRY;
1043
1044 ASSERT(skc->skc_magic == SKC_MAGIC);
1045
1046 if (flags & __GFP_WAIT) {
1047 flags |= __GFP_NOFAIL;
1048 local_irq_enable();
1049 might_sleep();
1050 }
1051
1052 sks = spl_slab_alloc(skc, flags);
1053 if (sks == NULL) {
1054 if (flags & __GFP_WAIT)
1055 local_irq_disable();
1056
1057 RETURN(NULL);
1058 }
1059
1060 if (flags & __GFP_WAIT)
1061 local_irq_disable();
1062
1063 /* Link the new empty slab in to the end of skc_partial_list */
1064 spin_lock(&skc->skc_lock);
1065 skc->skc_slab_total++;
1066 skc->skc_obj_total += sks->sks_objs;
1067 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1068 spin_unlock(&skc->skc_lock);
1069
1070 RETURN(sks);
1071 }
1072
1073 static int
1074 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1075 {
1076 spl_kmem_slab_t *sks;
1077 int rc = 0, refill;
1078 ENTRY;
1079
1080 ASSERT(skc->skc_magic == SKC_MAGIC);
1081 ASSERT(skm->skm_magic == SKM_MAGIC);
1082
1083 /* XXX: Check for refill bouncing by age perhaps */
1084 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1085
1086 spin_lock(&skc->skc_lock);
1087
1088 while (refill > 0) {
1089 /* No slabs available, we must grow the cache */
1090 if (list_empty(&skc->skc_partial_list)) {
1091 spin_unlock(&skc->skc_lock);
1092
1093 sks = spl_cache_grow(skc, flags);
1094 if (!sks)
1095 GOTO(out, rc);
1096
1097 /* Rescheduled to a different CPU, skm is not local */
1098 if (skm != skc->skc_mag[smp_processor_id()])
1099 GOTO(out, rc);
1100
1101 /* Potentially rescheduled to the same CPU but
1102 * allocations may have occurred from this CPU while
1103 * we were sleeping so recalculate max refill. */
1104 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1105
1106 spin_lock(&skc->skc_lock);
1107 continue;
1108 }
1109
1110 /* Grab the next available slab */
1111 sks = list_entry((&skc->skc_partial_list)->next,
1112 spl_kmem_slab_t, sks_list);
1113 ASSERT(sks->sks_magic == SKS_MAGIC);
1114 ASSERT(sks->sks_ref < sks->sks_objs);
1115 ASSERT(!list_empty(&sks->sks_free_list));
1116
1117 /* Consume as many objects as needed to refill the requested
1118 * cache. We must also be careful not to overfill it. */
1119 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1120 ASSERT(skm->skm_avail < skm->skm_size);
1121 ASSERT(rc < skm->skm_size);
1122 skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks);
1123 }
1124
1125 /* Move slab to skc_complete_list when full */
1126 if (sks->sks_ref == sks->sks_objs) {
1127 list_del(&sks->sks_list);
1128 list_add(&sks->sks_list, &skc->skc_complete_list);
1129 }
1130 }
1131
1132 spin_unlock(&skc->skc_lock);
1133 out:
1134 /* Returns the number of entries added to cache */
1135 RETURN(rc);
1136 }
1137
1138 static void
1139 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1140 {
1141 spl_kmem_slab_t *sks = NULL;
1142 spl_kmem_obj_t *sko = NULL;
1143 ENTRY;
1144
1145 ASSERT(skc->skc_magic == SKC_MAGIC);
1146 ASSERT(spin_is_locked(&skc->skc_lock));
1147
1148 sko = obj + P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align);
1149 ASSERT(sko->sko_magic == SKO_MAGIC);
1150
1151 sks = sko->sko_slab;
1152 ASSERT(sks->sks_magic == SKS_MAGIC);
1153 ASSERT(sks->sks_cache == skc);
1154 list_add(&sko->sko_list, &sks->sks_free_list);
1155
1156 sks->sks_age = jiffies;
1157 sks->sks_ref--;
1158 skc->skc_obj_alloc--;
1159
1160 /* Move slab to skc_partial_list when no longer full. Slabs
1161 * are added to the head to keep the partial list in quasi-full
1162 * sorted order. Fuller at the head, emptier at the tail. */
1163 if (sks->sks_ref == (sks->sks_objs - 1)) {
1164 list_del(&sks->sks_list);
1165 list_add(&sks->sks_list, &skc->skc_partial_list);
1166 }
1167
1168 /* Move empty slabs to the end of the partial list so
1169 * they can be easily found and freed during reclamation. */
1170 if (sks->sks_ref == 0) {
1171 list_del(&sks->sks_list);
1172 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1173 skc->skc_slab_alloc--;
1174 }
1175
1176 EXIT;
1177 }
1178
1179 static int
1180 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1181 {
1182 int i, count = MIN(flush, skm->skm_avail);
1183 ENTRY;
1184
1185 ASSERT(skc->skc_magic == SKC_MAGIC);
1186 ASSERT(skm->skm_magic == SKM_MAGIC);
1187
1188 spin_lock(&skc->skc_lock);
1189
1190 for (i = 0; i < count; i++)
1191 spl_cache_shrink(skc, skm->skm_objs[i]);
1192
1193 // __spl_slab_reclaim(skc);
1194 skm->skm_avail -= count;
1195 memmove(skm->skm_objs, &(skm->skm_objs[count]),
1196 sizeof(void *) * skm->skm_avail);
1197
1198 spin_unlock(&skc->skc_lock);
1199
1200 RETURN(count);
1201 }
1202
1203 void *
1204 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1205 {
1206 spl_kmem_magazine_t *skm;
1207 unsigned long irq_flags;
1208 void *obj = NULL;
1209 ENTRY;
1210
1211 ASSERT(skc->skc_magic == SKC_MAGIC);
1212 ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
1213 local_irq_save(irq_flags);
1214
1215 restart:
1216 /* Safe to update per-cpu structure without lock, but
1217 * in the restart case we must be careful to reacquire
1218 * the local magazine since this may have changed
1219 * when we need to grow the cache. */
1220 skm = skc->skc_mag[smp_processor_id()];
1221 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1222 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1223 skm->skm_size, skm->skm_refill, skm->skm_avail);
1224
1225 if (likely(skm->skm_avail)) {
1226 /* Object available in CPU cache, use it */
1227 obj = skm->skm_objs[--skm->skm_avail];
1228 if (!(skc->skc_flags & KMC_NOTOUCH))
1229 skm->skm_age = jiffies;
1230 } else {
1231 /* Per-CPU cache empty, directly allocate from
1232 * the slab and refill the per-CPU cache. */
1233 (void)spl_cache_refill(skc, skm, flags);
1234 GOTO(restart, obj = NULL);
1235 }
1236
1237 local_irq_restore(irq_flags);
1238 ASSERT(obj);
1239 ASSERT(((unsigned long)(obj) % skc->skc_obj_align) == 0);
1240
1241 /* Pre-emptively migrate object to CPU L1 cache */
1242 prefetchw(obj);
1243
1244 RETURN(obj);
1245 }
1246 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1247
1248 void
1249 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1250 {
1251 spl_kmem_magazine_t *skm;
1252 unsigned long flags;
1253 ENTRY;
1254
1255 ASSERT(skc->skc_magic == SKC_MAGIC);
1256 local_irq_save(flags);
1257
1258 /* Safe to update per-cpu structure without lock, but since
1259 * no remote memory allocation tracking is being performed
1260 * it is entirely possible to allocate an object from one
1261 * CPU cache and return it to another. */
1262 skm = skc->skc_mag[smp_processor_id()];
1263 ASSERT(skm->skm_magic == SKM_MAGIC);
1264
1265 /* Per-CPU cache full, flush it to make space */
1266 if (unlikely(skm->skm_avail >= skm->skm_size))
1267 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1268
1269 /* Available space in cache, use it */
1270 skm->skm_objs[skm->skm_avail++] = obj;
1271
1272 local_irq_restore(flags);
1273
1274 EXIT;
1275 }
1276 EXPORT_SYMBOL(spl_kmem_cache_free);
1277
1278 static int
1279 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
1280 {
1281 spl_kmem_cache_t *skc;
1282
1283 /* Under Linux a shrinker is not tightly coupled with a slab
1284 * cache. In fact Linux always systematically tries calling all
1285 * registered shrinker callbacks until its target reclamation level
1286 * is reached. Because of this we only register one shrinker
1287 * function in the shim layer for all slab caches. And we always
1288 * attempt to shrink all caches when this generic shrinker is called.
1289 */
1290 down_read(&spl_kmem_cache_sem);
1291
1292 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
1293 spl_kmem_cache_reap_now(skc);
1294
1295 up_read(&spl_kmem_cache_sem);
1296
1297 /* XXX: Under Linux a shrinker is expected to return the remaining
1298 * number of entries in the cache; we should do this as well.
1299 */
1300 return 1;
1301 }
1302
1303 void
1304 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1305 {
1306 spl_kmem_magazine_t *skm;
1307 int i;
1308 ENTRY;
1309
1310 ASSERT(skc->skc_magic == SKC_MAGIC);
1311
1312 if (skc->skc_reclaim)
1313 skc->skc_reclaim(skc->skc_private);
1314
1315 /* Ensure per-CPU caches which are idle gradually flush */
1316 for_each_online_cpu(i) {
1317 skm = skc->skc_mag[i];
1318
1319 if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1320 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1321 }
1322
1323 spl_slab_reclaim(skc);
1324
1325 EXIT;
1326 }
1327 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1328
1329 void
1330 spl_kmem_reap(void)
1331 {
1332 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
1333 }
1334 EXPORT_SYMBOL(spl_kmem_reap);
1335
1336 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
1337 static char *
1338 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
1339 {
1340 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
1341 int i, flag = 1;
1342
1343 ASSERT(str != NULL && len >= 17);
1344 memset(str, 0, len);
1345
1346 /* Check for a fully printable string, and while we are at
1347 * it place the printable characters in the passed buffer. */
1348 for (i = 0; i < size; i++) {
1349 str[i] = ((char *)(kd->kd_addr))[i];
1350 if (isprint(str[i])) {
1351 continue;
1352 } else {
1353 /* Minimum number of printable characters found
1354 * to make it worthwhile to print this as ASCII. */
1355 if (i > min)
1356 break;
1357
1358 flag = 0;
1359 break;
1360 }
1361 }
1362
1363 if (!flag) {
1364 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1365 *((uint8_t *)kd->kd_addr),
1366 *((uint8_t *)kd->kd_addr + 2),
1367 *((uint8_t *)kd->kd_addr + 4),
1368 *((uint8_t *)kd->kd_addr + 6),
1369 *((uint8_t *)kd->kd_addr + 8),
1370 *((uint8_t *)kd->kd_addr + 10),
1371 *((uint8_t *)kd->kd_addr + 12),
1372 *((uint8_t *)kd->kd_addr + 14));
1373 }
1374
1375 return str;
1376 }
1377
1378 static int
1379 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1380 {
1381 int i;
1382 ENTRY;
1383
1384 spin_lock_init(lock);
1385 INIT_LIST_HEAD(list);
1386
1387 for (i = 0; i < size; i++)
1388 INIT_HLIST_HEAD(&kmem_table[i]);
1389
1390 RETURN(0);
1391 }
1392
1393 static void
1394 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
1395 {
1396 unsigned long flags;
1397 kmem_debug_t *kd;
1398 char str[17];
1399 ENTRY;
1400
1401 spin_lock_irqsave(lock, flags);
1402 if (!list_empty(list))
1403 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1404 "size", "data", "func", "line");
1405
1406 list_for_each_entry(kd, list, kd_list)
1407 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1408 (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
1409 kd->kd_func, kd->kd_line);
1410
1411 spin_unlock_irqrestore(lock, flags);
1412 EXIT;
1413 }
1414 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1415 #define spl_kmem_init_tracking(list, lock, size)
1416 #define spl_kmem_fini_tracking(list, lock)
1417 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1418
1419 int
1420 spl_kmem_init(void)
1421 {
1422 int rc = 0;
1423 ENTRY;
1424
1425 init_rwsem(&spl_kmem_cache_sem);
1426 INIT_LIST_HEAD(&spl_kmem_cache_list);
1427
1428 #ifdef HAVE_SET_SHRINKER
1429 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1430 spl_kmem_cache_generic_shrinker);
1431 if (spl_kmem_cache_shrinker == NULL)
1432 RETURN(rc = -ENOMEM);
1433 #else
1434 register_shrinker(&spl_kmem_cache_shrinker);
1435 #endif
1436
1437 #ifdef DEBUG_KMEM
1438 atomic64_set(&kmem_alloc_used, 0);
1439 atomic64_set(&vmem_alloc_used, 0);
1440
1441 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1442 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1443 #endif
1444 RETURN(rc);
1445 }
1446
1447 void
1448 spl_kmem_fini(void)
1449 {
1450 #ifdef DEBUG_KMEM
1451 /* Display all unreclaimed memory addresses, including the
1452 * allocation size and the first few bytes of what's located
1453 * at that address to aid in debugging. Performance is not
1454 * a serious concern here since it is module unload time. */
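/* Hypothetical example of the unload-time report produced by
 * spl_kmem_fini_tracking() when an allocation leaks (the address,
 * size, data, and location values below are made up):
 *
 *   address          size  data             func:line
 *   ffff88001b2f3c00 112   ................ my_init:42
 */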
1455 if (atomic64_read(&kmem_alloc_used) != 0)
1456 CWARN("kmem leaked %ld/%ld bytes\n",
1457 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
1458
1459
1460 if (atomic64_read(&vmem_alloc_used) != 0)
1461 CWARN("vmem leaked %ld/%ld bytes\n",
1462 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
1463
1464 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1465 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1466 #endif /* DEBUG_KMEM */
1467 ENTRY;
1468
1469 #ifdef HAVE_SET_SHRINKER
1470 remove_shrinker(spl_kmem_cache_shrinker);
1471 #else
1472 unregister_shrinker(&spl_kmem_cache_shrinker);
1473 #endif
1474
1475 EXIT;
1476 }