1 /*****************************************************************************\
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
6 * UCRL-CODE-235197
7 *
8 * This file is part of the SPL, Solaris Porting Layer.
9 * For details, see <http://zfsonlinux.org/>.
10 *
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 *
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 * for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 *****************************************************************************
24 * Solaris Porting Layer (SPL) Kmem Implementation.
25 \*****************************************************************************/
26
27 #include <sys/kmem.h>
28
29 /*
30 * Within the scope of spl-kmem.c file the kmem_cache_* definitions
31 * are removed to allow access to the real Linux slab allocator.
32 */
33 #undef kmem_cache_destroy
34 #undef kmem_cache_create
35 #undef kmem_cache_alloc
36 #undef kmem_cache_free
37
38
39 /*
40 * Cache expiration was implemented because it was part of the default Solaris
41 * kmem_cache behavior. The idea is that per-cpu objects which haven't been
42 * accessed in several seconds should be returned to the cache. On the other
43 * hand Linux slabs never move objects back to the slabs unless there is
44 * memory pressure on the system. By default the Linux method is enabled
45 * because it has been shown to improve responsiveness on low memory systems.
46 * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
47 */
48 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
49 EXPORT_SYMBOL(spl_kmem_cache_expire);
50 module_param(spl_kmem_cache_expire, uint, 0644);
51 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
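/*
 * Usage note (illustrative): because the parameter above is registered with
 * mode 0644 the policy may be changed at runtime, for example to switch from
 * the default low-memory behavior to age based expiration:
 *
 *   # echo 0x1 > /sys/module/spl/parameters/spl_kmem_cache_expire
 *
 * or it may be set at load time, e.g. "modprobe spl spl_kmem_cache_expire=1".
 */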
52
53 /*
54 * The default behavior is to report the number of objects remaining in the
55 * cache. This allows the Linux VM to repeatedly reclaim objects from the
56 * cache when memory is low to satisfy other memory allocations. Alternately,
57 * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
58 * is reclaimed. This may increase the likelihood of out of memory events.
59 */
60 unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
61 module_param(spl_kmem_cache_reclaim, uint, 0644);
62 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
63
64 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
65 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
66 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
67
68 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
69 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
70 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
71 "Minimal number of objects per slab");
72
73 unsigned int spl_kmem_cache_max_size = 32;
74 module_param(spl_kmem_cache_max_size, uint, 0644);
75 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
76
77 /*
78 * For small objects the Linux slab allocator should be used to make the most
79 * efficient use of the memory. However, large objects are not supported by
80 * the Linux slab and therefore the SPL implementation is preferred. A cutoff
81 * of 16K was determined to be optimal for architectures using 4K pages.
82 */
83 #if PAGE_SIZE == 4096
84 unsigned int spl_kmem_cache_slab_limit = 16384;
85 #else
86 unsigned int spl_kmem_cache_slab_limit = 0;
87 #endif
88 module_param(spl_kmem_cache_slab_limit, uint, 0644);
89 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
90 "Objects less than N bytes use the Linux slab");
91
92 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
93 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
94 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
95 "Objects less than N bytes use the kmalloc");
96
97 vmem_t *heap_arena = NULL;
98 EXPORT_SYMBOL(heap_arena);
99
100 vmem_t *zio_alloc_arena = NULL;
101 EXPORT_SYMBOL(zio_alloc_arena);
102
103 vmem_t *zio_arena = NULL;
104 EXPORT_SYMBOL(zio_arena);
105
106 size_t
107 vmem_size(vmem_t *vmp, int typemask)
108 {
109 ASSERT3P(vmp, ==, NULL);
110 ASSERT3S(typemask & VMEM_ALLOC, ==, VMEM_ALLOC);
111 ASSERT3S(typemask & VMEM_FREE, ==, VMEM_FREE);
112
113 return (VMALLOC_TOTAL);
114 }
115 EXPORT_SYMBOL(vmem_size);
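/*
 * Example (illustrative): under this shim the only meaningful query is the
 * total size of the virtual address space heap, so callers pass a NULL
 * arena and both type bits:
 *
 *   size_t total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE);
 */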
116
117 int
118 kmem_debugging(void)
119 {
120 return 0;
121 }
122 EXPORT_SYMBOL(kmem_debugging);
123
124 char *
125 kmem_vasprintf(const char *fmt, va_list ap)
126 {
127 va_list aq;
128 char *ptr;
129
130 do {
131 va_copy(aq, ap);
132 ptr = kvasprintf(GFP_KERNEL, fmt, aq);
133 va_end(aq);
134 } while (ptr == NULL);
135
136 return ptr;
137 }
138 EXPORT_SYMBOL(kmem_vasprintf);
139
140 char *
141 kmem_asprintf(const char *fmt, ...)
142 {
143 va_list ap;
144 char *ptr;
145
146 do {
147 va_start(ap, fmt);
148 ptr = kvasprintf(GFP_KERNEL, fmt, ap);
149 va_end(ap);
150 } while (ptr == NULL);
151
152 return ptr;
153 }
154 EXPORT_SYMBOL(kmem_asprintf);
155
156 static char *
157 __strdup(const char *str, int flags)
158 {
159 char *ptr;
160 int n;
161
162 n = strlen(str);
163 ptr = kmalloc_nofail(n + 1, flags);
164 if (ptr)
165 memcpy(ptr, str, n + 1);
166
167 return ptr;
168 }
169
170 char *
171 strdup(const char *str)
172 {
173 return __strdup(str, KM_SLEEP);
174 }
175 EXPORT_SYMBOL(strdup);
176
177 void
178 strfree(char *str)
179 {
180 kfree(str);
181 }
182 EXPORT_SYMBOL(strfree);
183
184 /*
185 * Memory allocation interfaces and debugging for basic kmem_*
186 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
187 * the SPL will keep track of the total memory allocated, and
188 * report any memory leaked when the module is unloaded.
189 */
190 #ifdef DEBUG_KMEM
191
192 /* Shim layer memory accounting */
193 # ifdef HAVE_ATOMIC64_T
194 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
195 unsigned long long kmem_alloc_max = 0;
196 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
197 unsigned long long vmem_alloc_max = 0;
198 # else /* HAVE_ATOMIC64_T */
199 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
200 unsigned long long kmem_alloc_max = 0;
201 atomic_t vmem_alloc_used = ATOMIC_INIT(0);
202 unsigned long long vmem_alloc_max = 0;
203 # endif /* HAVE_ATOMIC64_T */
204
205 EXPORT_SYMBOL(kmem_alloc_used);
206 EXPORT_SYMBOL(kmem_alloc_max);
207 EXPORT_SYMBOL(vmem_alloc_used);
208 EXPORT_SYMBOL(vmem_alloc_max);
209
210 /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
211 * but also the location of every alloc and free. When the SPL module is
212 * unloaded a list of all leaked addresses and where they were allocated
213 * will be dumped to the console. Enabling this feature has a significant
214 * impact on performance but it makes finding memory leaks straightforward.
215 *
216 * Not surprisingly, with debugging enabled the xmem_locks are very highly
217 * contended, particularly on xfree(). If we want to run with this detailed
218 * debugging enabled for anything other than debugging we need to minimize
219 * the contention by moving to a lock per xmem_table entry model.
220 */
221 # ifdef DEBUG_KMEM_TRACKING
222
223 # define KMEM_HASH_BITS 10
224 # define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
225
226 # define VMEM_HASH_BITS 10
227 # define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
228
229 typedef struct kmem_debug {
230 struct hlist_node kd_hlist; /* Hash node linkage */
231 struct list_head kd_list; /* List of all allocations */
232 void *kd_addr; /* Allocation pointer */
233 size_t kd_size; /* Allocation size */
234 const char *kd_func; /* Allocation function */
235 int kd_line; /* Allocation line */
236 } kmem_debug_t;
237
238 spinlock_t kmem_lock;
239 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
240 struct list_head kmem_list;
241
242 spinlock_t vmem_lock;
243 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
244 struct list_head vmem_list;
245
246 EXPORT_SYMBOL(kmem_lock);
247 EXPORT_SYMBOL(kmem_table);
248 EXPORT_SYMBOL(kmem_list);
249
250 EXPORT_SYMBOL(vmem_lock);
251 EXPORT_SYMBOL(vmem_table);
252 EXPORT_SYMBOL(vmem_list);
253
254 static kmem_debug_t *
255 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
256 {
257 struct hlist_head *head;
258 struct hlist_node *node;
259 struct kmem_debug *p;
260 unsigned long flags;
261
262 spin_lock_irqsave(lock, flags);
263
264 head = &table[hash_ptr((void *)addr, bits)];
265 hlist_for_each(node, head) {
266 p = list_entry(node, struct kmem_debug, kd_hlist);
267 if (p->kd_addr == addr) {
268 hlist_del_init(&p->kd_hlist);
269 list_del_init(&p->kd_list);
270 spin_unlock_irqrestore(lock, flags);
271 return p;
272 }
273 }
274
275 spin_unlock_irqrestore(lock, flags);
276
277 return (NULL);
278 }
279
280 void *
281 kmem_alloc_track(size_t size, int flags, const char *func, int line,
282 int node_alloc, int node)
283 {
284 void *ptr = NULL;
285 kmem_debug_t *dptr;
286 unsigned long irq_flags;
287
288 /* Function may be called with KM_NOSLEEP so failure is possible */
289 dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
290 flags & ~__GFP_ZERO);
291
292 if (unlikely(dptr == NULL)) {
293 printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d "
294 "failed (%lld/%llu)\n", sizeof(kmem_debug_t), flags,
295 func, line, kmem_alloc_used_read(), kmem_alloc_max);
296 } else {
297 /*
298 * Marked unlikely because we should never be doing this,
299 * we tolerate up to 2 pages but a single page is best.
300 */
301 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
302 printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) "
303 "at %s:%d failed (%lld/%llu)\n",
304 (unsigned long long)size, flags, func, line,
305 kmem_alloc_used_read(), kmem_alloc_max);
306 spl_dumpstack();
307 }
308
309 /*
310 * We use __strdup() below because the string pointed to by
311 * __FUNCTION__ might not be available by the time we want
312 * to print it since the module might have been unloaded.
313 * This can only fail in the KM_NOSLEEP case.
314 */
315 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
316 if (unlikely(dptr->kd_func == NULL)) {
317 kfree(dptr);
318 printk(KERN_WARNING "debug __strdup() at %s:%d "
319 "failed (%lld/%llu)\n", func, line,
320 kmem_alloc_used_read(), kmem_alloc_max);
321 goto out;
322 }
323
324 /* Use the correct allocator */
325 if (node_alloc) {
326 ASSERT(!(flags & __GFP_ZERO));
327 ptr = kmalloc_node_nofail(size, flags, node);
328 } else if (flags & __GFP_ZERO) {
329 ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
330 } else {
331 ptr = kmalloc_nofail(size, flags);
332 }
333
334 if (unlikely(ptr == NULL)) {
335 kfree(dptr->kd_func);
336 kfree(dptr);
337 printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) "
338 "at %s:%d failed (%lld/%llu)\n",
339 (unsigned long long) size, flags, func, line,
340 kmem_alloc_used_read(), kmem_alloc_max);
341 goto out;
342 }
343
344 kmem_alloc_used_add(size);
345 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
346 kmem_alloc_max = kmem_alloc_used_read();
347
348 INIT_HLIST_NODE(&dptr->kd_hlist);
349 INIT_LIST_HEAD(&dptr->kd_list);
350
351 dptr->kd_addr = ptr;
352 dptr->kd_size = size;
353 dptr->kd_line = line;
354
355 spin_lock_irqsave(&kmem_lock, irq_flags);
356 hlist_add_head(&dptr->kd_hlist,
357 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
358 list_add_tail(&dptr->kd_list, &kmem_list);
359 spin_unlock_irqrestore(&kmem_lock, irq_flags);
360 }
361 out:
362 return (ptr);
363 }
364 EXPORT_SYMBOL(kmem_alloc_track);
365
366 void
367 kmem_free_track(const void *ptr, size_t size)
368 {
369 kmem_debug_t *dptr;
370
371 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
372 (unsigned long long) size);
373
374 /* Must exist in hash due to kmem_alloc() */
375 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
376 ASSERT(dptr);
377
378 /* Size must match */
379 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
380 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
381 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
382
383 kmem_alloc_used_sub(size);
384 kfree(dptr->kd_func);
385
386 memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
387 kfree(dptr);
388
389 memset((void *)ptr, 0x5a, size);
390 kfree(ptr);
391 }
392 EXPORT_SYMBOL(kmem_free_track);
393
394 void *
395 vmem_alloc_track(size_t size, int flags, const char *func, int line)
396 {
397 void *ptr = NULL;
398 kmem_debug_t *dptr;
399 unsigned long irq_flags;
400
401 ASSERT(flags & KM_SLEEP);
402
403 /* Function may be called with KM_NOSLEEP so failure is possible */
404 dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
405 flags & ~__GFP_ZERO);
406 if (unlikely(dptr == NULL)) {
407 printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) "
408 "at %s:%d failed (%lld/%llu)\n",
409 sizeof(kmem_debug_t), flags, func, line,
410 vmem_alloc_used_read(), vmem_alloc_max);
411 } else {
412 /*
413 * We use __strdup() below because the string pointed to by
414 * __FUNCTION__ might not be available by the time we want
415 * to print it, since the module might have been unloaded.
416 * This can never fail because we have already asserted
417 * that flags is KM_SLEEP.
418 */
419 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
420 if (unlikely(dptr->kd_func == NULL)) {
421 kfree(dptr);
422 printk(KERN_WARNING "debug __strdup() at %s:%d "
423 "failed (%lld/%llu)\n", func, line,
424 vmem_alloc_used_read(), vmem_alloc_max);
425 goto out;
426 }
427
428 /* Use the correct allocator */
429 if (flags & __GFP_ZERO) {
430 ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
431 } else {
432 ptr = vmalloc_nofail(size, flags);
433 }
434
435 if (unlikely(ptr == NULL)) {
436 kfree(dptr->kd_func);
437 kfree(dptr);
438 printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) "
439 "at %s:%d failed (%lld/%llu)\n",
440 (unsigned long long) size, flags, func, line,
441 vmem_alloc_used_read(), vmem_alloc_max);
442 goto out;
443 }
444
445 vmem_alloc_used_add(size);
446 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
447 vmem_alloc_max = vmem_alloc_used_read();
448
449 INIT_HLIST_NODE(&dptr->kd_hlist);
450 INIT_LIST_HEAD(&dptr->kd_list);
451
452 dptr->kd_addr = ptr;
453 dptr->kd_size = size;
454 dptr->kd_line = line;
455
456 spin_lock_irqsave(&vmem_lock, irq_flags);
457 hlist_add_head(&dptr->kd_hlist,
458 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
459 list_add_tail(&dptr->kd_list, &vmem_list);
460 spin_unlock_irqrestore(&vmem_lock, irq_flags);
461 }
462 out:
463 return (ptr);
464 }
465 EXPORT_SYMBOL(vmem_alloc_track);
466
467 void
468 vmem_free_track(const void *ptr, size_t size)
469 {
470 kmem_debug_t *dptr;
471
472 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
473 (unsigned long long) size);
474
475 /* Must exist in hash due to vmem_alloc() */
476 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
477 ASSERT(dptr);
478
479 /* Size must match */
480 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
481 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
482 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
483
484 vmem_alloc_used_sub(size);
485 kfree(dptr->kd_func);
486
487 memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
488 kfree(dptr);
489
490 memset((void *)ptr, 0x5a, size);
491 vfree(ptr);
492 }
493 EXPORT_SYMBOL(vmem_free_track);
494
495 # else /* DEBUG_KMEM_TRACKING */
496
497 void *
498 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
499 int node_alloc, int node)
500 {
501 void *ptr;
502
503 /*
504 * Marked unlikely because we should never be doing this,
505 * we tolerate up to 2 pages but a single page is best.
506 */
507 if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
508 printk(KERN_WARNING
509 "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
510 (unsigned long long)size, flags, func, line,
511 (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
512 spl_dumpstack();
513 }
514
515 /* Use the correct allocator */
516 if (node_alloc) {
517 ASSERT(!(flags & __GFP_ZERO));
518 ptr = kmalloc_node_nofail(size, flags, node);
519 } else if (flags & __GFP_ZERO) {
520 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
521 } else {
522 ptr = kmalloc_nofail(size, flags);
523 }
524
525 if (unlikely(ptr == NULL)) {
526 printk(KERN_WARNING
527 "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
528 (unsigned long long)size, flags, func, line,
529 (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
530 } else {
531 kmem_alloc_used_add(size);
532 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
533 kmem_alloc_max = kmem_alloc_used_read();
534 }
535
536 return (ptr);
537 }
538 EXPORT_SYMBOL(kmem_alloc_debug);
539
540 void
541 kmem_free_debug(const void *ptr, size_t size)
542 {
543 ASSERT(ptr || size > 0);
544 kmem_alloc_used_sub(size);
545 kfree(ptr);
546 }
547 EXPORT_SYMBOL(kmem_free_debug);
548
549 void *
550 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
551 {
552 void *ptr;
553
554 ASSERT(flags & KM_SLEEP);
555
556 /* Use the correct allocator */
557 if (flags & __GFP_ZERO) {
558 ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
559 } else {
560 ptr = vmalloc_nofail(size, flags);
561 }
562
563 if (unlikely(ptr == NULL)) {
564 printk(KERN_WARNING
565 "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
566 (unsigned long long)size, flags, func, line,
567 (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max);
568 } else {
569 vmem_alloc_used_add(size);
570 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
571 vmem_alloc_max = vmem_alloc_used_read();
572 }
573
574 return (ptr);
575 }
576 EXPORT_SYMBOL(vmem_alloc_debug);
577
578 void
579 vmem_free_debug(const void *ptr, size_t size)
580 {
581 ASSERT(ptr || size > 0);
582 vmem_alloc_used_sub(size);
583 vfree(ptr);
584 }
585 EXPORT_SYMBOL(vmem_free_debug);
586
587 # endif /* DEBUG_KMEM_TRACKING */
588 #endif /* DEBUG_KMEM */
589
590 /*
591 * Slab allocation interfaces
592 *
593 * While the Linux slab implementation was inspired by the Solaris
594 * implementation I cannot use it to emulate the Solaris APIs. I
595 * require two features which are not provided by the Linux slab.
596 *
597 * 1) Constructors AND destructors. Recent versions of the Linux
598 * kernel have removed support for destructors. This is a deal
599 * breaker for the SPL which contains particularly expensive
600 * initializers for mutex's, condition variables, etc. We also
601 * require a minimal level of cleanup for these data types unlike
602 * many Linux data types which do not need to be explicitly destroyed.
603 *
604 * 2) Virtual address space backed slab. Callers of the Solaris slab
605 * expect it to work well for both small and very large allocations.
606 * Because of memory fragmentation the Linux slab which is backed
607 * by kmalloc'ed memory performs very badly when confronted with
608 * large numbers of large allocations. Basing the slab on the
609 * virtual address space removes the need for contiguous pages
610 * and greatly improves performance for large allocations.
611 *
612 * For these reasons, the SPL has its own slab implementation with
613 * the needed features. It is not as highly optimized as either the
614 * Solaris or Linux slabs, but it should get me most of what is
615 * needed until it can be optimized or obsoleted by another approach.
616 *
617 * One serious concern I do have about this method is the relatively
618 * small virtual address space on 32bit arches. This will seriously
619 * constrain the size of the slab caches and their performance.
620 *
621 * XXX: Improve the partial slab list by carefully maintaining a
622 * strict ordering of fullest to emptiest slabs based on
623 * the slab reference count. This guarantees that when freeing
624 * slabs back to the system we need only linearly traverse the
625 * last N slabs in the list to discover all the freeable slabs.
626 *
627 * XXX: NUMA awareness for optionally allocating memory close to a
628 * particular core. This can be advantageous if you know the slab
629 * object will be short lived and primarily accessed from one core.
630 *
631 * XXX: Slab coloring may also yield performance improvements and would
632 * be desirable to implement.
633 */
634
635 struct list_head spl_kmem_cache_list; /* List of caches */
636 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
637 taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
638
639 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
640
641 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
642 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
643 spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
644
645 static void *
646 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
647 {
648 void *ptr;
649
650 ASSERT(ISP2(size));
651
652 if (skc->skc_flags & KMC_KMEM)
653 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
654 get_order(size));
655 else
656 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
657
658 /* Resulting allocated memory will be page aligned */
659 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
660
661 return ptr;
662 }
663
664 static void
665 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
666 {
667 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
668 ASSERT(ISP2(size));
669
670 /*
671 * The Linux direct reclaim path uses this out of band value to
672 * determine if forward progress is being made. Normally this is
673 * incremented by kmem_freepages() which is part of the various
674 * Linux slab implementations. However, since we are using none
675 * of that infrastructure we are responsible for incrementing it.
676 */
677 if (current->reclaim_state)
678 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
679
680 if (skc->skc_flags & KMC_KMEM)
681 free_pages((unsigned long)ptr, get_order(size));
682 else
683 vfree(ptr);
684 }
685
686 /*
687 * Required space for each aligned sks.
688 */
689 static inline uint32_t
690 spl_sks_size(spl_kmem_cache_t *skc)
691 {
692 return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
693 skc->skc_obj_align, uint32_t);
694 }
695
696 /*
697 * Required space for each aligned object.
698 */
699 static inline uint32_t
700 spl_obj_size(spl_kmem_cache_t *skc)
701 {
702 uint32_t align = skc->skc_obj_align;
703
704 return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
705 P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
706 }
707
708 /*
709 * Lookup the spl_kmem_obj_t for a given object.
710 */
711 static inline spl_kmem_obj_t *
712 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
713 {
714 return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
715 skc->skc_obj_align, uint32_t);
716 }
717
718 /*
719 * Required space for each offslab object taking into account alignment
720 * restrictions and the power-of-two requirement of kv_alloc().
721 */
722 static inline uint32_t
723 spl_offslab_size(spl_kmem_cache_t *skc)
724 {
725 return 1UL << (fls64(spl_obj_size(skc)) + 1);
726 }
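/*
 * Worked example (illustrative): for a hypothetical cache whose aligned
 * object size from spl_obj_size() is 20 KiB, fls64() returns 15 and each
 * offslab object is therefore allocated as 1UL << 16 = 64 KiB, satisfying
 * the power-of-two size required by kv_alloc().
 */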
727
728 /*
729 * It's important that we pack the spl_kmem_obj_t structure and the
730 * actual objects into one large address space to minimize the number
731 * of calls to the allocator. It is far better to do a few large
732 * allocations and then subdivide them ourselves. Which allocator
733 * we use requires balancing a few trade-offs.
734 *
735 * For small objects we use kmem_alloc() because as long as you are
736 * only requesting a small number of pages (ideally just one) its cheap.
737 * However, when you start requesting multiple pages with kmem_alloc()
738 * it gets increasingly expensive since it requires contiguous pages.
739 * For this reason we shift to vmem_alloc() for slabs of large objects
740 * which removes the need for contiguous pages. We do not use
741 * vmem_alloc() in all cases because there is significant locking
742 * overhead in __get_vm_area_node(). This function takes a single
743 * global lock when acquiring an available virtual address range which
744 * serializes all vmem_alloc()'s for all slab caches. Using slightly
745 * different allocation functions for small and large objects should
746 * give us the best of both worlds.
747 *
748 * KMC_ONSLAB KMC_OFFSLAB
749 *
750 * +------------------------+ +-----------------+
751 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
752 * | skc_obj_size <-+ | | +-----------------+ | |
753 * | spl_kmem_obj_t | | | |
754 * | skc_obj_size <---+ | +-----------------+ | |
755 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
756 * | ... v | | spl_kmem_obj_t | |
757 * +------------------------+ +-----------------+ v
758 */
759 static spl_kmem_slab_t *
760 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
761 {
762 spl_kmem_slab_t *sks;
763 spl_kmem_obj_t *sko, *n;
764 void *base, *obj;
765 uint32_t obj_size, offslab_size = 0;
766 int i, rc = 0;
767
768 base = kv_alloc(skc, skc->skc_slab_size, flags);
769 if (base == NULL)
770 return (NULL);
771
772 sks = (spl_kmem_slab_t *)base;
773 sks->sks_magic = SKS_MAGIC;
774 sks->sks_objs = skc->skc_slab_objs;
775 sks->sks_age = jiffies;
776 sks->sks_cache = skc;
777 INIT_LIST_HEAD(&sks->sks_list);
778 INIT_LIST_HEAD(&sks->sks_free_list);
779 sks->sks_ref = 0;
780 obj_size = spl_obj_size(skc);
781
782 if (skc->skc_flags & KMC_OFFSLAB)
783 offslab_size = spl_offslab_size(skc);
784
785 for (i = 0; i < sks->sks_objs; i++) {
786 if (skc->skc_flags & KMC_OFFSLAB) {
787 obj = kv_alloc(skc, offslab_size, flags);
788 if (!obj) {
789 rc = -ENOMEM;
790 goto out;
791 }
792 } else {
793 obj = base + spl_sks_size(skc) + (i * obj_size);
794 }
795
796 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
797 sko = spl_sko_from_obj(skc, obj);
798 sko->sko_addr = obj;
799 sko->sko_magic = SKO_MAGIC;
800 sko->sko_slab = sks;
801 INIT_LIST_HEAD(&sko->sko_list);
802 list_add_tail(&sko->sko_list, &sks->sks_free_list);
803 }
804
805 out:
806 if (rc) {
807 if (skc->skc_flags & KMC_OFFSLAB)
808 list_for_each_entry_safe(sko, n, &sks->sks_free_list,
809 sko_list)
810 kv_free(skc, sko->sko_addr, offslab_size);
811
812 kv_free(skc, base, skc->skc_slab_size);
813 sks = NULL;
814 }
815
816 return (sks);
817 }
818
819 /*
820 * Remove a slab from the complete or partial list; it must be called with
821 * the 'skc->skc_lock' held but the actual free must be performed
822 * outside the lock to prevent deadlocking on vmem addresses.
823 */
824 static void
825 spl_slab_free(spl_kmem_slab_t *sks,
826 struct list_head *sks_list, struct list_head *sko_list)
827 {
828 spl_kmem_cache_t *skc;
829
830 ASSERT(sks->sks_magic == SKS_MAGIC);
831 ASSERT(sks->sks_ref == 0);
832
833 skc = sks->sks_cache;
834 ASSERT(skc->skc_magic == SKC_MAGIC);
835 ASSERT(spin_is_locked(&skc->skc_lock));
836
837 /*
838 * Update slab/objects counters in the cache, then remove the
839 * slab from the skc->skc_partial_list. Finally add the slab
840 * and all its objects in to the private work lists where the
841 * destructors will be called and the memory freed to the system.
842 */
843 skc->skc_obj_total -= sks->sks_objs;
844 skc->skc_slab_total--;
845 list_del(&sks->sks_list);
846 list_add(&sks->sks_list, sks_list);
847 list_splice_init(&sks->sks_free_list, sko_list);
848 }
849
850 /*
851 * Traverses all the partial slabs attached to a cache and free those
852 * which are currently empty, and have not been touched for
853 * skc_delay seconds to avoid thrashing. The count argument is
854 * passed to optionally cap the number of slabs reclaimed, a count
855 * of zero means try and reclaim everything. When flag is set we
856 * always free an available slab regardless of age.
857 */
858 static void
859 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
860 {
861 spl_kmem_slab_t *sks, *m;
862 spl_kmem_obj_t *sko, *n;
863 LIST_HEAD(sks_list);
864 LIST_HEAD(sko_list);
865 uint32_t size = 0;
866 int i = 0;
867
868 /*
869 * Move empty slabs and objects which have not been touched in
870 * skc_delay seconds on to private lists to be freed outside
871 * the spin lock. This delay time is important to avoid thrashing
872 * however when flag is set the delay will not be used.
873 */
874 spin_lock(&skc->skc_lock);
875 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) {
876 /*
877 * All empty slabs are at the end of skc->skc_partial_list,
878 * therefore once a non-empty slab is found we can stop
879 * scanning. Additionally, stop when reaching the target
880 * reclaim 'count' if a non-zero threshold is given.
881 */
882 if ((sks->sks_ref > 0) || (count && i >= count))
883 break;
884
885 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) || flag) {
886 spl_slab_free(sks, &sks_list, &sko_list);
887 i++;
888 }
889 }
890 spin_unlock(&skc->skc_lock);
891
892 /*
893 * The following two loops ensure all the object destructors are
894 * run, any offslab objects are freed, and the slabs themselves
895 * are freed. This is all done outside the skc->skc_lock since
896 * this allows the destructor to sleep, and allows us to perform
897 * a conditional reschedule when freeing a large number of
898 * objects and slabs back to the system.
899 */
900 if (skc->skc_flags & KMC_OFFSLAB)
901 size = spl_offslab_size(skc);
902
903 list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
904 ASSERT(sko->sko_magic == SKO_MAGIC);
905
906 if (skc->skc_flags & KMC_OFFSLAB)
907 kv_free(skc, sko->sko_addr, size);
908 }
909
910 list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
911 ASSERT(sks->sks_magic == SKS_MAGIC);
912 kv_free(skc, sks, skc->skc_slab_size);
913 }
914 }
915
916 static spl_kmem_emergency_t *
917 spl_emergency_search(struct rb_root *root, void *obj)
918 {
919 struct rb_node *node = root->rb_node;
920 spl_kmem_emergency_t *ske;
921 unsigned long address = (unsigned long)obj;
922
923 while (node) {
924 ske = container_of(node, spl_kmem_emergency_t, ske_node);
925
926 if (address < (unsigned long)ske->ske_obj)
927 node = node->rb_left;
928 else if (address > (unsigned long)ske->ske_obj)
929 node = node->rb_right;
930 else
931 return ske;
932 }
933
934 return NULL;
935 }
936
937 static int
938 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
939 {
940 struct rb_node **new = &(root->rb_node), *parent = NULL;
941 spl_kmem_emergency_t *ske_tmp;
942 unsigned long address = (unsigned long)ske->ske_obj;
943
944 while (*new) {
945 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
946
947 parent = *new;
948 if (address < (unsigned long)ske_tmp->ske_obj)
949 new = &((*new)->rb_left);
950 else if (address > (unsigned long)ske_tmp->ske_obj)
951 new = &((*new)->rb_right);
952 else
953 return 0;
954 }
955
956 rb_link_node(&ske->ske_node, parent, new);
957 rb_insert_color(&ske->ske_node, root);
958
959 return 1;
960 }
961
962 /*
963 * Allocate a single emergency object and track it in a red black tree.
964 */
965 static int
966 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
967 {
968 spl_kmem_emergency_t *ske;
969 int empty;
970
971 /* Last chance: use a partial slab if one now exists */
972 spin_lock(&skc->skc_lock);
973 empty = list_empty(&skc->skc_partial_list);
974 spin_unlock(&skc->skc_lock);
975 if (!empty)
976 return (-EEXIST);
977
978 ske = kmalloc(sizeof(*ske), flags);
979 if (ske == NULL)
980 return (-ENOMEM);
981
982 ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
983 if (ske->ske_obj == NULL) {
984 kfree(ske);
985 return (-ENOMEM);
986 }
987
988 spin_lock(&skc->skc_lock);
989 empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
990 if (likely(empty)) {
991 skc->skc_obj_total++;
992 skc->skc_obj_emergency++;
993 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
994 skc->skc_obj_emergency_max = skc->skc_obj_emergency;
995 }
996 spin_unlock(&skc->skc_lock);
997
998 if (unlikely(!empty)) {
999 kfree(ske->ske_obj);
1000 kfree(ske);
1001 return (-EINVAL);
1002 }
1003
1004 *obj = ske->ske_obj;
1005
1006 return (0);
1007 }
1008
1009 /*
1010 * Locate the passed object in the red black tree and free it.
1011 */
1012 static int
1013 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
1014 {
1015 spl_kmem_emergency_t *ske;
1016
1017 spin_lock(&skc->skc_lock);
1018 ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
1019 if (likely(ske)) {
1020 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
1021 skc->skc_obj_emergency--;
1022 skc->skc_obj_total--;
1023 }
1024 spin_unlock(&skc->skc_lock);
1025
1026 if (unlikely(ske == NULL))
1027 return (-ENOENT);
1028
1029 kfree(ske->ske_obj);
1030 kfree(ske);
1031
1032 return (0);
1033 }
1034
1035 /*
1036 * Release objects from the per-cpu magazine back to their slab. The flush
1037 * argument contains the max number of entries to remove from the magazine.
1038 */
1039 static void
1040 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1041 {
1042 int i, count = MIN(flush, skm->skm_avail);
1043
1044 ASSERT(skc->skc_magic == SKC_MAGIC);
1045 ASSERT(skm->skm_magic == SKM_MAGIC);
1046 ASSERT(spin_is_locked(&skc->skc_lock));
1047
1048 for (i = 0; i < count; i++)
1049 spl_cache_shrink(skc, skm->skm_objs[i]);
1050
1051 skm->skm_avail -= count;
1052 memmove(skm->skm_objs, &(skm->skm_objs[count]),
1053 sizeof(void *) * skm->skm_avail);
1054 }
1055
1056 static void
1057 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1058 {
1059 spin_lock(&skc->skc_lock);
1060 __spl_cache_flush(skc, skm, flush);
1061 spin_unlock(&skc->skc_lock);
1062 }
1063
1064 static void
1065 spl_magazine_age(void *data)
1066 {
1067 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1068 spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
1069
1070 ASSERT(skm->skm_magic == SKM_MAGIC);
1071 ASSERT(skm->skm_cpu == smp_processor_id());
1072 ASSERT(irqs_disabled());
1073
1074 /* There are no available objects or they are too young to age out */
1075 if ((skm->skm_avail == 0) ||
1076 time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
1077 return;
1078
1079 /*
1080 * Because we're executing in interrupt context we may have
1081 * interrupted the holder of this lock. To avoid a potential
1082 * deadlock return if the lock is contended.
1083 */
1084 if (!spin_trylock(&skc->skc_lock))
1085 return;
1086
1087 __spl_cache_flush(skc, skm, skm->skm_refill);
1088 spin_unlock(&skc->skc_lock);
1089 }
1090
1091 /*
1092 * Called regularly to keep a downward pressure on the cache.
1093 *
1094 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
1095 * be returned to the caches. This is done to prevent idle magazines from
1096 * holding memory which could be better used elsewhere. The delay is
1097 * present to prevent thrashing the magazine.
1098 *
1099 * The newly released objects may result in empty partial slabs. Those
1100 * slabs should be released to the system. Otherwise moving the objects
1101 * out of the magazines is just wasted work.
1102 */
1103 static void
1104 spl_cache_age(void *data)
1105 {
1106 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1107 taskqid_t id = 0;
1108
1109 ASSERT(skc->skc_magic == SKC_MAGIC);
1110
1111 /* Dynamically disabled at run time */
1112 if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
1113 return;
1114
1115 atomic_inc(&skc->skc_ref);
1116
1117 if (!(skc->skc_flags & KMC_NOMAGAZINE))
1118 on_each_cpu(spl_magazine_age, skc, 1);
1119
1120 spl_slab_reclaim(skc, skc->skc_reap, 0);
1121
1122 while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
1123 id = taskq_dispatch_delay(
1124 spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
1125 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1126
1127 /* If a destroy was issued after dispatch, immediately cancel it */
1128 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
1129 taskq_cancel_id(spl_kmem_cache_taskq, id);
1130 }
1131
1132 spin_lock(&skc->skc_lock);
1133 skc->skc_taskqid = id;
1134 spin_unlock(&skc->skc_lock);
1135
1136 atomic_dec(&skc->skc_ref);
1137 }
1138
1139 /*
1140 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
1141 * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
1142 * for very small objects we may end up with more than this so as not
1143 * to waste space in the minimal allocation of a single page. Also for
1144 * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
1145 * below this minimum the sizing will fail.
1146 */
1147 static int
1148 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1149 {
1150 uint32_t sks_size, obj_size, max_size;
1151
1152 if (skc->skc_flags & KMC_OFFSLAB) {
1153 *objs = spl_kmem_cache_obj_per_slab;
1154 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
1155 return (0);
1156 } else {
1157 sks_size = spl_sks_size(skc);
1158 obj_size = spl_obj_size(skc);
1159
1160 if (skc->skc_flags & KMC_KMEM)
1161 max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
1162 else
1163 max_size = (spl_kmem_cache_max_size * 1024 * 1024);
1164
1165 /* Power of two sized slab */
1166 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
1167 *objs = (*size - sks_size) / obj_size;
1168 if (*objs >= spl_kmem_cache_obj_per_slab)
1169 return (0);
1170 }
1171
1172 /*
1173 * Unable to satisfy the target objects per slab, fall back to
1174 * allocating a maximally sized slab and use it if it can hold
1175 * at least the minimum object count; otherwise fail.
1176 */
1177 *size = max_size;
1178 *objs = (*size - sks_size) / obj_size;
1179 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
1180 return (0);
1181 }
1182
1183 return (-ENOSPC);
1184 }
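/*
 * Worked example (illustrative, all figures assumed): for an aligned object
 * size of 2048 bytes, an sks header of 128 bytes, and a target of 16 objects
 * per slab, the power-of-two search above yields 1, 3, 7, 15, and finally 31
 * objects for 4K, 8K, 16K, 32K, and 64K slabs, so a 64 KiB slab holding 31
 * objects is selected.
 */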
1185
1186 /*
1187 * Make a guess at reasonable per-cpu magazine size based on the size of
1188 * each object and the cost of caching N of them in each magazine. Long
1189 * term this should really adapt based on an observed usage heuristic.
1190 */
1191 static int
1192 spl_magazine_size(spl_kmem_cache_t *skc)
1193 {
1194 uint32_t obj_size = spl_obj_size(skc);
1195 int size;
1196
1197 /* Per-magazine sizes below assume a 4 KiB page size */
1198 if (obj_size > (PAGE_SIZE * 256))
1199 size = 4; /* Minimum 4 MiB per-magazine */
1200 else if (obj_size > (PAGE_SIZE * 32))
1201 size = 16; /* Minimum 2 MiB per-magazine */
1202 else if (obj_size > (PAGE_SIZE))
1203 size = 64; /* Minimum 256 KiB per-magazine */
1204 else if (obj_size > (PAGE_SIZE / 4))
1205 size = 128; /* Minimum 128 KiB per-magazine */
1206 else
1207 size = 256;
1208
1209 return (size);
1210 }
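/*
 * Example (illustrative): with 4 KiB pages an 8 KiB object falls into the
 * third bucket above, so each per-cpu magazine caches up to 64 objects,
 * roughly 512 KiB per cpu for that cache.
 */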
1211
1212 /*
1213 * Allocate a per-cpu magazine to associate with a specific core.
1214 */
1215 static spl_kmem_magazine_t *
1216 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
1217 {
1218 spl_kmem_magazine_t *skm;
1219 int size = sizeof(spl_kmem_magazine_t) +
1220 sizeof(void *) * skc->skc_mag_size;
1221
1222 skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
1223 if (skm) {
1224 skm->skm_magic = SKM_MAGIC;
1225 skm->skm_avail = 0;
1226 skm->skm_size = skc->skc_mag_size;
1227 skm->skm_refill = skc->skc_mag_refill;
1228 skm->skm_cache = skc;
1229 skm->skm_age = jiffies;
1230 skm->skm_cpu = cpu;
1231 }
1232
1233 return (skm);
1234 }
1235
1236 /*
1237 * Free a per-cpu magazine associated with a specific core.
1238 */
1239 static void
1240 spl_magazine_free(spl_kmem_magazine_t *skm)
1241 {
1242 int size = sizeof(spl_kmem_magazine_t) +
1243 sizeof(void *) * skm->skm_size;
1244
1245 ASSERT(skm->skm_magic == SKM_MAGIC);
1246 ASSERT(skm->skm_avail == 0);
1247
1248 kmem_free(skm, size);
1249 }
1250
1251 /*
1252 * Create all per-cpu magazines of reasonable sizes.
1253 */
1254 static int
1255 spl_magazine_create(spl_kmem_cache_t *skc)
1256 {
1257 int i;
1258
1259 if (skc->skc_flags & KMC_NOMAGAZINE)
1260 return (0);
1261
1262 skc->skc_mag_size = spl_magazine_size(skc);
1263 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1264
1265 for_each_online_cpu(i) {
1266 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
1267 if (!skc->skc_mag[i]) {
1268 for (i--; i >= 0; i--)
1269 spl_magazine_free(skc->skc_mag[i]);
1270
1271 return (-ENOMEM);
1272 }
1273 }
1274
1275 return (0);
1276 }
1277
1278 /*
1279 * Destroy all per-cpu magazines.
1280 */
1281 static void
1282 spl_magazine_destroy(spl_kmem_cache_t *skc)
1283 {
1284 spl_kmem_magazine_t *skm;
1285 int i;
1286
1287 if (skc->skc_flags & KMC_NOMAGAZINE)
1288 return;
1289
1290 for_each_online_cpu(i) {
1291 skm = skc->skc_mag[i];
1292 spl_cache_flush(skc, skm, skm->skm_avail);
1293 spl_magazine_free(skm);
1294 }
1295 }
1296
1297 /*
1298 * Create an object cache based on the following arguments:
1299 * name cache name
1300 * size cache object size
1301 * align cache object alignment
1302 * ctor cache object constructor
1303 * dtor cache object destructor
1304 * reclaim cache object reclaim
1305 * priv cache private data for ctor/dtor/reclaim
1306 * vmp unused must be NULL
1307 * flags
1308 * KMC_NOTOUCH Disable cache object aging (unsupported)
1309 * KMC_NODEBUG Disable debugging (unsupported)
1310 * KMC_NOHASH Disable hashing (unsupported)
1311 * KMC_QCACHE Disable qcache (unsupported)
1312 * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
1313 * KMC_KMEM Force kmem backed cache
1314 * KMC_VMEM Force vmem backed cache
1315 * KMC_SLAB Force Linux slab backed cache
1316 * KMC_OFFSLAB Locate objects off the slab
1317 */
1318 spl_kmem_cache_t *
1319 spl_kmem_cache_create(char *name, size_t size, size_t align,
1320 spl_kmem_ctor_t ctor,
1321 spl_kmem_dtor_t dtor,
1322 spl_kmem_reclaim_t reclaim,
1323 void *priv, void *vmp, int flags)
1324 {
1325 spl_kmem_cache_t *skc;
1326 int rc;
1327
1328 /*
1329 * Unsupported flags
1330 */
1331 ASSERT0(flags & KMC_NOMAGAZINE);
1332 ASSERT0(flags & KMC_NOHASH);
1333 ASSERT0(flags & KMC_QCACHE);
1334 ASSERT(vmp == NULL);
1335
1336 might_sleep();
1337
1338 /*
1339 * Allocate memory for a new cache and initialize it. Unfortunately,
1340 * this usually ends up being a large allocation of ~32k because
1341 * we need to allocate enough memory for the worst case number of
1342 * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
1343 * explicitly pass KM_NODEBUG to suppress the kmem warning.
1344 */
1345 skc = kmem_zalloc(sizeof(*skc), KM_SLEEP | KM_NODEBUG);
1346 if (skc == NULL)
1347 return (NULL);
1348
1349 skc->skc_magic = SKC_MAGIC;
1350 skc->skc_name_size = strlen(name) + 1;
1351 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
1352 if (skc->skc_name == NULL) {
1353 kmem_free(skc, sizeof(*skc));
1354 return (NULL);
1355 }
1356 strncpy(skc->skc_name, name, skc->skc_name_size);
1357
1358 skc->skc_ctor = ctor;
1359 skc->skc_dtor = dtor;
1360 skc->skc_reclaim = reclaim;
1361 skc->skc_private = priv;
1362 skc->skc_vmp = vmp;
1363 skc->skc_linux_cache = NULL;
1364 skc->skc_flags = flags;
1365 skc->skc_obj_size = size;
1366 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1367 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1368 skc->skc_reap = SPL_KMEM_CACHE_REAP;
1369 atomic_set(&skc->skc_ref, 0);
1370
1371 INIT_LIST_HEAD(&skc->skc_list);
1372 INIT_LIST_HEAD(&skc->skc_complete_list);
1373 INIT_LIST_HEAD(&skc->skc_partial_list);
1374 skc->skc_emergency_tree = RB_ROOT;
1375 spin_lock_init(&skc->skc_lock);
1376 init_waitqueue_head(&skc->skc_waitq);
1377 skc->skc_slab_fail = 0;
1378 skc->skc_slab_create = 0;
1379 skc->skc_slab_destroy = 0;
1380 skc->skc_slab_total = 0;
1381 skc->skc_slab_alloc = 0;
1382 skc->skc_slab_max = 0;
1383 skc->skc_obj_total = 0;
1384 skc->skc_obj_alloc = 0;
1385 skc->skc_obj_max = 0;
1386 skc->skc_obj_deadlock = 0;
1387 skc->skc_obj_emergency = 0;
1388 skc->skc_obj_emergency_max = 0;
1389
1390 /*
1391 * Verify the requested alignment restriction is sane.
1392 */
1393 if (align) {
1394 VERIFY(ISP2(align));
1395 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
1396 VERIFY3U(align, <=, PAGE_SIZE);
1397 skc->skc_obj_align = align;
1398 }
1399
1400 /*
1401 * When no specific type of slab is requested (kmem, vmem, or
1402 * Linux slab) select a cache type based on the object size
1403 * and default tunables.
1404 */
1405 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
1406
1407 /*
1408 * Objects smaller than spl_kmem_cache_slab_limit can
1409 * use the Linux slab for better space-efficiency. By
1410 * default this is only enabled on 4 KiB page systems until its
1411 * performance characteristics are fully understood.
1412 */
1413 if (spl_kmem_cache_slab_limit &&
1414 size <= (size_t)spl_kmem_cache_slab_limit)
1415 skc->skc_flags |= KMC_SLAB;
1416
1417 /*
1418 * Small objects, less than spl_kmem_cache_kmem_limit per
1419 * object should use kmem because their slabs are small.
1420 */
1421 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
1422 skc->skc_flags |= KMC_KMEM;
1423
1424 /*
1425 * All other objects are considered large and are placed
1426 * on vmem backed slabs.
1427 */
1428 else
1429 skc->skc_flags |= KMC_VMEM;
1430 }
1431
1432 /*
1433 * Given the type of slab allocate the required resources.
1434 */
1435 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1436 rc = spl_slab_size(skc,
1437 &skc->skc_slab_objs, &skc->skc_slab_size);
1438 if (rc)
1439 goto out;
1440
1441 rc = spl_magazine_create(skc);
1442 if (rc)
1443 goto out;
1444 } else {
1445 skc->skc_linux_cache = kmem_cache_create(
1446 skc->skc_name, size, align, 0, NULL);
1447 if (skc->skc_linux_cache == NULL) {
1448 rc = ENOMEM;
1449 goto out;
1450 }
1451
1452 kmem_cache_set_allocflags(skc, __GFP_COMP);
1453 skc->skc_flags |= KMC_NOMAGAZINE;
1454 }
1455
1456 if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
1457 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
1458 spl_cache_age, skc, TQ_SLEEP,
1459 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1460
1461 down_write(&spl_kmem_cache_sem);
1462 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1463 up_write(&spl_kmem_cache_sem);
1464
1465 return (skc);
1466 out:
1467 kmem_free(skc->skc_name, skc->skc_name_size);
1468 kmem_free(skc, sizeof(*skc));
1469 return (NULL);
1470 }
1471 EXPORT_SYMBOL(spl_kmem_cache_create);
1472
1473 /*
1474 * Register a move callback for cache defragmentation.
1475 * XXX: Unimplemented but harmless to stub out for now.
1476 */
1477 void
1478 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
1479 kmem_cbrc_t (move)(void *, void *, size_t, void *))
1480 {
1481 ASSERT(move != NULL);
1482 }
1483 EXPORT_SYMBOL(spl_kmem_cache_set_move);
1484
1485 /*
1486 * Destroy a cache and all objects associated with the cache.
1487 */
1488 void
1489 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1490 {
1491 DECLARE_WAIT_QUEUE_HEAD(wq);
1492 taskqid_t id;
1493
1494 ASSERT(skc->skc_magic == SKC_MAGIC);
1495 ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1496
1497 down_write(&spl_kmem_cache_sem);
1498 list_del_init(&skc->skc_list);
1499 up_write(&spl_kmem_cache_sem);
1500
1501 /* Cancel and wait for any pending delayed tasks */
1502 VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1503
1504 spin_lock(&skc->skc_lock);
1505 id = skc->skc_taskqid;
1506 spin_unlock(&skc->skc_lock);
1507
1508 taskq_cancel_id(spl_kmem_cache_taskq, id);
1509
1510 /* Wait until all current callers complete; this is mainly
1511 * to catch the case where a low memory situation triggers a
1512 * cache reaping action which races with this destroy. */
1513 wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1514
1515 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1516 spl_magazine_destroy(skc);
1517 spl_slab_reclaim(skc, 0, 1);
1518 } else {
1519 ASSERT(skc->skc_flags & KMC_SLAB);
1520 kmem_cache_destroy(skc->skc_linux_cache);
1521 }
1522
1523 spin_lock(&skc->skc_lock);
1524
1525 /* Validate there are no objects in use and free all the
1526 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1527 ASSERT3U(skc->skc_slab_alloc, ==, 0);
1528 ASSERT3U(skc->skc_obj_alloc, ==, 0);
1529 ASSERT3U(skc->skc_slab_total, ==, 0);
1530 ASSERT3U(skc->skc_obj_total, ==, 0);
1531 ASSERT3U(skc->skc_obj_emergency, ==, 0);
1532 ASSERT(list_empty(&skc->skc_complete_list));
1533
1534 kmem_free(skc->skc_name, skc->skc_name_size);
1535 spin_unlock(&skc->skc_lock);
1536
1537 kmem_free(skc, sizeof(*skc));
1538 }
1539 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1540
1541 /*
1542 * Allocate an object from a slab attached to the cache. This is used to
1543 * repopulate the per-cpu magazine caches in batches when they run low.
1544 */
1545 static void *
1546 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1547 {
1548 spl_kmem_obj_t *sko;
1549
1550 ASSERT(skc->skc_magic == SKC_MAGIC);
1551 ASSERT(sks->sks_magic == SKS_MAGIC);
1552 ASSERT(spin_is_locked(&skc->skc_lock));
1553
1554 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1555 ASSERT(sko->sko_magic == SKO_MAGIC);
1556 ASSERT(sko->sko_addr != NULL);
1557
1558 /* Remove from sks_free_list */
1559 list_del_init(&sko->sko_list);
1560
1561 sks->sks_age = jiffies;
1562 sks->sks_ref++;
1563 skc->skc_obj_alloc++;
1564
1565 /* Track max obj usage statistics */
1566 if (skc->skc_obj_alloc > skc->skc_obj_max)
1567 skc->skc_obj_max = skc->skc_obj_alloc;
1568
1569 /* Track max slab usage statistics */
1570 if (sks->sks_ref == 1) {
1571 skc->skc_slab_alloc++;
1572
1573 if (skc->skc_slab_alloc > skc->skc_slab_max)
1574 skc->skc_slab_max = skc->skc_slab_alloc;
1575 }
1576
1577 return sko->sko_addr;
1578 }
1579
1580 /*
1581 * Generic slab allocation function to be run by the global work queues.
1582 * It is responsible for allocating a new slab, linking it in to the list
1583 * of partial slabs, and then waking any waiters.
1584 */
1585 static void
1586 spl_cache_grow_work(void *data)
1587 {
1588 spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1589 spl_kmem_cache_t *skc = ska->ska_cache;
1590 spl_kmem_slab_t *sks;
1591
1592 sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1593 spin_lock(&skc->skc_lock);
1594 if (sks) {
1595 skc->skc_slab_total++;
1596 skc->skc_obj_total += sks->sks_objs;
1597 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1598 }
1599
1600 atomic_dec(&skc->skc_ref);
1601 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1602 clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1603 wake_up_all(&skc->skc_waitq);
1604 spin_unlock(&skc->skc_lock);
1605
1606 kfree(ska);
1607 }
1608
1609 /*
1610 * Returns non-zero when a new slab should be available.
1611 */
1612 static int
1613 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1614 {
1615 return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1616 }
1617
1618 /*
1619 * No available objects on any slabs, create a new slab. Note that this
1620 * functionality is disabled for KMC_SLAB caches which are backed by the
1621 * Linux slab.
1622 */
1623 static int
1624 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1625 {
1626 int remaining, rc;
1627
1628 ASSERT(skc->skc_magic == SKC_MAGIC);
1629 ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1630 might_sleep();
1631 *obj = NULL;
1632
1633 /*
1634 * Before allocating a new slab wait for any reaping to complete and
1635 * then return so the local magazine can be rechecked for new objects.
1636 */
1637 if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1638 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1639 TASK_UNINTERRUPTIBLE);
1640 return (rc ? rc : -EAGAIN);
1641 }
1642
1643 /*
1644 * This is handled by dispatching a work request to the global work
1645 * queue. This allows us to asynchronously allocate a new slab while
1646 * retaining the ability to safely fall back to a smaller synchronous
1647 * allocations to ensure forward progress is always maintained.
1648 */
1649 if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1650 spl_kmem_alloc_t *ska;
1651
1652 ska = kmalloc(sizeof(*ska), flags);
1653 if (ska == NULL) {
1654 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1655 wake_up_all(&skc->skc_waitq);
1656 return (-ENOMEM);
1657 }
1658
1659 atomic_inc(&skc->skc_ref);
1660 ska->ska_cache = skc;
1661 ska->ska_flags = flags & ~__GFP_FS;
1662 taskq_init_ent(&ska->ska_tqe);
1663 taskq_dispatch_ent(spl_kmem_cache_taskq,
1664 spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1665 }
1666
1667 /*
1668 * The goal here is to only detect the rare case where a virtual slab
1669 * allocation has deadlocked. We must be careful to minimize the use
1670 * of emergency objects which are more expensive to track. Therefore,
1671 * we set a very long timeout for the asynchronous allocation and if
1672 * the timeout is reached the cache is flagged as deadlocked. From
1673 * this point only new emergency objects will be allocated until the
1674 * asynchronous allocation completes and clears the deadlocked flag.
1675 */
1676 if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1677 rc = spl_emergency_alloc(skc, flags, obj);
1678 } else {
1679 remaining = wait_event_timeout(skc->skc_waitq,
1680 spl_cache_grow_wait(skc), HZ);
1681
1682 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1683 spin_lock(&skc->skc_lock);
1684 if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1685 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1686 skc->skc_obj_deadlock++;
1687 }
1688 spin_unlock(&skc->skc_lock);
1689 }
1690
1691 rc = -ENOMEM;
1692 }
1693
1694 return (rc);
1695 }
1696
1697 /*
1698 * Refill a per-cpu magazine with objects from the slabs for this cache.
1699 * Ideally the magazine can be repopulated using existing objects which have
1700 * been released, however if we are unable to locate enough free objects new
1701 * slabs of objects will be created. On success NULL is returned, otherwise
1702 * the address of a single emergency object is returned for use by the caller.
1703 */
1704 static void *
1705 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1706 {
1707 spl_kmem_slab_t *sks;
1708 int count = 0, rc, refill;
1709 void *obj = NULL;
1710
1711 ASSERT(skc->skc_magic == SKC_MAGIC);
1712 ASSERT(skm->skm_magic == SKM_MAGIC);
1713
1714 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1715 spin_lock(&skc->skc_lock);
1716
1717 while (refill > 0) {
1718 /* No slabs available we may need to grow the cache */
1719 if (list_empty(&skc->skc_partial_list)) {
1720 spin_unlock(&skc->skc_lock);
1721
1722 local_irq_enable();
1723 rc = spl_cache_grow(skc, flags, &obj);
1724 local_irq_disable();
1725
1726 /* Emergency object for immediate use by caller */
1727 if (rc == 0 && obj != NULL)
1728 return (obj);
1729
1730 if (rc)
1731 goto out;
1732
1733 /* Rescheduled to a different CPU, skm is not local */
1734 if (skm != skc->skc_mag[smp_processor_id()])
1735 goto out;
1736
1737 /* Potentially rescheduled to the same CPU but
1738 * allocations may have occurred from this CPU while
1739 * we were sleeping so recalculate max refill. */
1740 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1741
1742 spin_lock(&skc->skc_lock);
1743 continue;
1744 }
1745
1746 /* Grab the next available slab */
1747 sks = list_entry((&skc->skc_partial_list)->next,
1748 spl_kmem_slab_t, sks_list);
1749 ASSERT(sks->sks_magic == SKS_MAGIC);
1750 ASSERT(sks->sks_ref < sks->sks_objs);
1751 ASSERT(!list_empty(&sks->sks_free_list));
1752
1753 /* Consume as many objects as needed to refill the requested
1754 * cache. We must also be careful not to overfill it. */
1755 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
1756 ASSERT(skm->skm_avail < skm->skm_size);
1757 ASSERT(count < skm->skm_size);
1758 skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks);
1759 }
1760
1761 /* Move slab to skc_complete_list when full */
1762 if (sks->sks_ref == sks->sks_objs) {
1763 list_del(&sks->sks_list);
1764 list_add(&sks->sks_list, &skc->skc_complete_list);
1765 }
1766 }
1767
1768 spin_unlock(&skc->skc_lock);
1769 out:
1770 return (NULL);
1771 }
1772
1773 /*
1774 * Release an object back to the slab from which it came.
1775 */
1776 static void
1777 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1778 {
1779 spl_kmem_slab_t *sks = NULL;
1780 spl_kmem_obj_t *sko = NULL;
1781
1782 ASSERT(skc->skc_magic == SKC_MAGIC);
1783 ASSERT(spin_is_locked(&skc->skc_lock));
1784
1785 sko = spl_sko_from_obj(skc, obj);
1786 ASSERT(sko->sko_magic == SKO_MAGIC);
1787 sks = sko->sko_slab;
1788 ASSERT(sks->sks_magic == SKS_MAGIC);
1789 ASSERT(sks->sks_cache == skc);
1790 list_add(&sko->sko_list, &sks->sks_free_list);
1791
1792 sks->sks_age = jiffies;
1793 sks->sks_ref--;
1794 skc->skc_obj_alloc--;
1795
1796 /* Move slab to skc_partial_list when no longer full. Slabs
1797 * are added to the head to keep the partial list in quasi-full
1798 * sorted order. Fuller at the head, emptier at the tail. */
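/* For example (hypothetical counts): head -> [7/8 used] -> [3/8] ->
 * [1/8] -> [0/8] -> tail, so reclamation can harvest the empty slabs
 * which accumulate at the tail. */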
1799 if (sks->sks_ref == (sks->sks_objs - 1)) {
1800 list_del(&sks->sks_list);
1801 list_add(&sks->sks_list, &skc->skc_partial_list);
1802 }
1803
1804 /* Move empty slabs to the end of the partial list so
1805 * they can be easily found and freed during reclamation. */
1806 if (sks->sks_ref == 0) {
1807 list_del(&sks->sks_list);
1808 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1809 skc->skc_slab_alloc--;
1810 }
1811 }
1812
1813 /*
1814 * Allocate an object from the per-cpu magazine, or, if the magazine
1815 * is empty, allocate directly from a slab and repopulate the magazine.
1816 */
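/*
 * Usage sketch (illustrative only): 'skc' is assumed to have been created
 * elsewhere with spl_kmem_cache_create(), and KM_SLEEP callers are
 * guaranteed to eventually succeed.
 *
 *	void *obj;
 *
 *	obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *	... use the object ...
 *	spl_kmem_cache_free(skc, obj);
 */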
1817 void *
1818 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1819 {
1820 spl_kmem_magazine_t *skm;
1821 void *obj = NULL;
1822
1823 ASSERT(skc->skc_magic == SKC_MAGIC);
1824 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1825 ASSERT(flags & KM_SLEEP);
1826
1827 atomic_inc(&skc->skc_ref);
1828
1829 /*
1830 * Allocate directly from a Linux slab. All optimizations are left
1831 * to the underlying cache; we only need to guarantee that KM_SLEEP
1832 * callers will never fail.
1833 */
1834 if (skc->skc_flags & KMC_SLAB) {
1835 struct kmem_cache *slc = skc->skc_linux_cache;
1836
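/* Retry until the underlying Linux cache satisfies the allocation;
 * only KM_NOSLEEP callers may see a NULL object here. */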
1837 do {
1838 obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
1839 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1840
1841 goto ret;
1842 }
1843
1844 local_irq_disable();
1845
1846 restart:
1847 /* Safe to update the per-cpu structure without a lock, but
1848 * in the restart case we must be careful to reacquire
1849 * the local magazine since it may have changed while
1850 * we were growing the cache. */
1851 skm = skc->skc_mag[smp_processor_id()];
1852 ASSERT(skm->skm_magic == SKM_MAGIC);
1853
1854 if (likely(skm->skm_avail)) {
1855 /* Object available in CPU cache, use it */
1856 obj = skm->skm_objs[--skm->skm_avail];
1857 skm->skm_age = jiffies;
1858 } else {
1859 obj = spl_cache_refill(skc, skm, flags);
1860 if (obj == NULL)
1861 goto restart;
1862 }
1863
1864 local_irq_enable();
1865 ASSERT(obj);
1866 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1867
1868 ret:
1869 /* Pre-emptively migrate object to CPU L1 cache */
1870 if (obj) {
1871 if (skc->skc_ctor)
1872 skc->skc_ctor(obj, skc->skc_private, flags);
1873 else
1874 prefetchw(obj);
1875 }
1876
1877 atomic_dec(&skc->skc_ref);
1878
1879 return (obj);
1880 }
1881
1882 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1883
1884 /*
1885 * Free an object back to the local per-cpu magazine; there is no
1886 * guarantee that this is the same magazine the object was originally
1887 * allocated from. We may need to flush entries from the magazine
1888 * back to the slabs to make space.
1889 */
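/*
 * Illustrative example (assumed values): with skm_size = 32, skm_avail = 32
 * and skm_refill = 16, the free path below first flushes up to 16 objects
 * back to their slabs via spl_cache_flush() to make room, and then stores
 * the freed object in the magazine.
 */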
1890 void
1891 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1892 {
1893 spl_kmem_magazine_t *skm;
1894 unsigned long flags;
1895
1896 ASSERT(skc->skc_magic == SKC_MAGIC);
1897 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1898 atomic_inc(&skc->skc_ref);
1899
1900 /*
1901 * Run the destructor
1902 */
1903 if (skc->skc_dtor)
1904 skc->skc_dtor(obj, skc->skc_private);
1905
1906 /*
1907 * Free the object back to the underlying Linux slab.
1908 */
1909 if (skc->skc_flags & KMC_SLAB) {
1910 kmem_cache_free(skc->skc_linux_cache, obj);
1911 goto out;
1912 }
1913
1914 /*
1915 * Only virtual slabs may have emergency objects and these objects
1916 * are guaranteed to have physical addresses. They must be removed
1917 * from the tree of emergency objects and then freed.
1918 */
1919 if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
1920 spl_emergency_free(skc, obj);
1921 goto out;
1922 }
1923
1924 local_irq_save(flags);
1925
1926 /* Safe to update the per-cpu structure without a lock, but
1927 * since no remote memory allocation tracking is performed
1928 * it is entirely possible to allocate an object from one
1929 * CPU cache and return it to another. */
1930 skm = skc->skc_mag[smp_processor_id()];
1931 ASSERT(skm->skm_magic == SKM_MAGIC);
1932
1933 /* Per-CPU cache full, flush it to make space */
1934 if (unlikely(skm->skm_avail >= skm->skm_size))
1935 spl_cache_flush(skc, skm, skm->skm_refill);
1936
1937 /* Available space in cache, use it */
1938 skm->skm_objs[skm->skm_avail++] = obj;
1939
1940 local_irq_restore(flags);
1941 out:
1942 atomic_dec(&skc->skc_ref);
1943 }
1944 EXPORT_SYMBOL(spl_kmem_cache_free);
1945
1946 /*
1947 * The generic shrinker function for all caches. Under Linux a shrinker
1948 * may not be tightly coupled with a slab cache. In fact Linux always
1949 * systematically tries calling all registered shrinker callbacks which
1950 * report that they contain unused objects. Because of this we only
1951 * register one shrinker function in the shim layer for all slab caches.
1952 * We always attempt to shrink all caches when this generic shrinker
1953 * is called.
1954 *
1955 * If sc->nr_to_scan is zero, the caller is requesting a query of the
1956 * number of objects which can potentially be freed. If it is nonzero,
1957 * the request is to free that many objects.
1958 *
1959 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
1960 * in struct shrinker and also require the shrinker to return the number
1961 * of objects freed.
1962 *
1963 * Older kernels require the shrinker to return the number of freeable
1964 * objects remaining after freeing nr_to_scan of them.
1965 *
1966 * Linux semantics differ from those under Solaris, which are to
1967 * free all available objects which may (and probably will) be more
1968 * objects than the requested nr_to_scan.
1969 */
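/*
 * Illustrative sketch (hypothetical values) of the two cases handled below,
 * as a caller of this callback might set them up:
 *
 *	struct shrink_control sc;
 *
 *	sc.gfp_mask = GFP_KERNEL;
 *	sc.nr_to_scan = 0;	(query: return the count of freeable objects)
 *	sc.nr_to_scan = 64;	(scan: try to reap roughly 64 objects)
 *
 * With HAVE_SPLIT_SHRINKER_CALLBACK defined the scan case returns the number
 * of objects actually freed; otherwise the number still allocated is returned.
 */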
1970 static spl_shrinker_t
1971 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
1972 struct shrink_control *sc)
1973 {
1974 spl_kmem_cache_t *skc;
1975 int alloc = 0;
1976
1977 down_read(&spl_kmem_cache_sem);
1978 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1979 if (sc->nr_to_scan) {
1980 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
1981 uint64_t oldalloc = skc->skc_obj_alloc;
1982 spl_kmem_cache_reap_now(skc,
1983 MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1984 if (oldalloc > skc->skc_obj_alloc)
1985 alloc += oldalloc - skc->skc_obj_alloc;
1986 #else
1987 spl_kmem_cache_reap_now(skc,
1988 MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1989 alloc += skc->skc_obj_alloc;
1990 #endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
1991 } else {
1992 /* Request to query number of freeable objects */
1993 alloc += skc->skc_obj_alloc;
1994 }
1995 }
1996 up_read(&spl_kmem_cache_sem);
1997
1998 /*
1999 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
2000 * This functionality only exists to work around a rare issue where
2001 * shrink_slab() is repeatedly invoked by many cores causing the
2002 * system to thrash.
2003 */
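/*
 * Note: spl_kmem_cache_reclaim is exposed as a module parameter, so this
 * behavior can presumably be toggled at runtime via
 * /sys/module/spl/parameters/spl_kmem_cache_reclaim (path assumes the
 * module is loaded under the name "spl").
 */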
2004 if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
2005 return (SHRINK_STOP);
2006
2007 return (MAX(alloc, 0));
2008 }
2009
2010 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
2011
2012 /*
2013 * Call the registered reclaim function for a cache. Depending on how
2014 * many and which objects are released it may simply repopulate the
2015 * local magazine which will then need to age-out. Objects which cannot
2016 * fit in the magazine will be released back to their slabs, which will
2017 * also need to age out before being released. This is all just best
2018 * effort and we do not want to thrash creating and destroying slabs.
2019 */
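/*
 * Illustrative example (assumed values): the generic shrinker above scales
 * the object-based sc->nr_to_scan down by the per-slab object count before
 * calling this function, e.g. with skc_slab_objs = 32 (fls64(32) = 6) and
 * sc->nr_to_scan = 512 the requested count is MAX(512 >> 6, 1) = 8.
 */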
2020 void
2021 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
2022 {
2023 ASSERT(skc->skc_magic == SKC_MAGIC);
2024 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2025
2026 atomic_inc(&skc->skc_ref);
2027
2028 /*
2029 * Execute the registered reclaim callback if it exists. The
2030 * per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
2031 */
2032 if (skc->skc_flags & KMC_SLAB) {
2033 if (skc->skc_reclaim)
2034 skc->skc_reclaim(skc->skc_private);
2035
2036 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
2037 kmem_cache_shrink(skc->skc_linux_cache);
2038
2039 goto out;
2040 }
2041
2042 /*
2043 * Prevent concurrent cache reaping when contended.
2044 */
2045 if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
2046 goto out;
2047
2048 /*
2049 * When a reclaim function is available it may be invoked repeatedly
2050 * until at least a single slab can be freed. This ensures that we
2051 * do free memory back to the system. This helps minimize the chance
2052 * of an OOM event when the bulk of memory is used by the slab.
2053 *
2054 * When free slabs are already available the reclaim callback will be
2055 * skipped. Additionally, if no forward progress is detected despite
2056 * invoking the reclaim function, the cache is skipped to avoid deadlock.
2057 *
2058 * Longer term this would be the correct place to add the code which
2059 * repacks the slabs in order to minimize fragmentation.
2060 */
2061 if (skc->skc_reclaim) {
2062 uint64_t objects = UINT64_MAX;
2063 int do_reclaim;
2064
2065 do {
2066 spin_lock(&skc->skc_lock);
2067 do_reclaim =
2068 (skc->skc_slab_total > 0) &&
2069 ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
2070 (skc->skc_obj_alloc < objects);
2071
2072 objects = skc->skc_obj_alloc;
2073 spin_unlock(&skc->skc_lock);
2074
2075 if (do_reclaim)
2076 skc->skc_reclaim(skc->skc_private);
2077
2078 } while (do_reclaim);
2079 }
2080
2081 /* Reclaim from the magazine then the slabs ignoring age and delay. */
2082 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
2083 spl_kmem_magazine_t *skm;
2084 unsigned long irq_flags;
2085
2086 local_irq_save(irq_flags);
2087 skm = skc->skc_mag[smp_processor_id()];
2088 spl_cache_flush(skc, skm, skm->skm_avail);
2089 local_irq_restore(irq_flags);
2090 }
2091
2092 spl_slab_reclaim(skc, count, 1);
2093 clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
2094 smp_wmb();
2095 wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
2096 out:
2097 atomic_dec(&skc->skc_ref);
2098 }
2099 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
2100
2101 /*
2102 * Reap all free slabs from all registered caches.
2103 */
2104 void
2105 spl_kmem_reap(void)
2106 {
2107 struct shrink_control sc;
2108
2109 sc.nr_to_scan = KMC_REAP_CHUNK;
2110 sc.gfp_mask = GFP_KERNEL;
2111
2112 (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
2113 }
2114 EXPORT_SYMBOL(spl_kmem_reap);
2115
2116 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
2117 static char *
2118 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
2119 {
2120 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
2121 int i, flag = 1;
2122
2123 ASSERT(str != NULL && len >= 17);
2124 memset(str, 0, len);
2125
2126 /* Check for a fully printable string, and while we are at
2127 * it place the printable characters in the passed buffer. */
2128 for (i = 0; i < size; i++) {
2129 str[i] = ((char *)(kd->kd_addr))[i];
2130 if (isprint(str[i])) {
2131 continue;
2132 } else {
2133 /* Enough printable characters were found to make
2134 * it worthwhile to print this buffer as ascii. */
2135 if (i > min)
2136 break;
2137
2138 flag = 0;
2139 break;
2140 }
2141 }
2142
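/* Not printable: dump every other byte of the first 16 bytes as hex. */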
2143 if (!flag) {
2144 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
2145 *((uint8_t *)kd->kd_addr),
2146 *((uint8_t *)kd->kd_addr + 2),
2147 *((uint8_t *)kd->kd_addr + 4),
2148 *((uint8_t *)kd->kd_addr + 6),
2149 *((uint8_t *)kd->kd_addr + 8),
2150 *((uint8_t *)kd->kd_addr + 10),
2151 *((uint8_t *)kd->kd_addr + 12),
2152 *((uint8_t *)kd->kd_addr + 14));
2153 }
2154
2155 return (str);
2156 }
2157
2158 static int
2159 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
2160 {
2161 int i;
2162
2163 spin_lock_init(lock);
2164 INIT_LIST_HEAD(list);
2165
2166 for (i = 0; i < size; i++)
2167 INIT_HLIST_HEAD(&kmem_table[i]);
2168
2169 return (0);
2170 }
2171
2172 static void
2173 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
2174 {
2175 unsigned long flags;
2176 kmem_debug_t *kd;
2177 char str[17];
2178
2179 spin_lock_irqsave(lock, flags);
2180 if (!list_empty(list))
2181 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
2182 "size", "data", "func", "line");
2183
2184 list_for_each_entry(kd, list, kd_list)
2185 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
2186 (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2187 kd->kd_func, kd->kd_line);
2188
2189 spin_unlock_irqrestore(lock, flags);
2190 }
2191 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2192 #define spl_kmem_init_tracking(list, lock, size)
2193 #define spl_kmem_fini_tracking(list, lock)
2194 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2195
2196 int
2197 spl_kmem_init(void)
2198 {
2199 int rc = 0;
2200
2201 #ifdef DEBUG_KMEM
2202 kmem_alloc_used_set(0);
2203 vmem_alloc_used_set(0);
2204
2205 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2206 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2207 #endif
2208
2209 init_rwsem(&spl_kmem_cache_sem);
2210 INIT_LIST_HEAD(&spl_kmem_cache_list);
2211 spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
2212 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
2213
2214 spl_register_shrinker(&spl_kmem_cache_shrinker);
2215
2216 return (rc);
2217 }
2218
2219 void
2220 spl_kmem_fini(void)
2221 {
2222 spl_unregister_shrinker(&spl_kmem_cache_shrinker);
2223 taskq_destroy(spl_kmem_cache_taskq);
2224
2225 #ifdef DEBUG_KMEM
2226 /* Display all unreclaimed memory addresses, including the
2227 * allocation size and the first few bytes of what's located
2228 * at that address to aid in debugging. Performance is not
2229 * a serious concern here since it is module unload time. */
2230 if (kmem_alloc_used_read() != 0)
2231 printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
2232 kmem_alloc_used_read(), kmem_alloc_max);
2233
2234 if (vmem_alloc_used_read() != 0)
2235 printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n",
2236 vmem_alloc_used_read(), vmem_alloc_max);
2237
2238 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2239 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2240 #endif /* DEBUG_KMEM */
2241 }