/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *
 * This file is part of the SPL, Solaris Porting Layer.
 * For details, see <http://zfsonlinux.org/>.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/taskq.h>
#include <sys/timer.h>
#include <sys/vmem.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/mm_compat.h>
#include <linux/wait_compat.h>
/*
 * Within the scope of this file the kmem_cache_* definitions
 * are removed to allow access to the real Linux slab allocator.
 */
#undef	kmem_cache_destroy
#undef	kmem_cache_create
#undef	kmem_cache_alloc
#undef	kmem_cache_free
/*
 * Cache expiration was implemented because it was part of the default Solaris
 * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
 * accessed in several seconds should be returned to the cache.  On the other
 * hand Linux slabs never move objects back to the slabs unless there is
 * memory pressure on the system.  By default the Linux method is enabled
 * because it has been shown to improve responsiveness on low memory systems.
 * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
 */
unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
EXPORT_SYMBOL(spl_kmem_cache_expire);
module_param(spl_kmem_cache_expire, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
/*
 * The default behavior is to report the number of objects remaining in the
 * cache.  This allows the Linux VM to repeatedly reclaim objects from the
 * cache when memory is low to satisfy other memory allocations.  Alternately,
 * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
 * is reclaimed.  This may increase the likelihood of out of memory events.
 */
unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
module_param(spl_kmem_cache_reclaim, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");

unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
	"Minimal number of objects per slab");

unsigned int spl_kmem_cache_max_size = 32;
module_param(spl_kmem_cache_max_size, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
/*
 * For small objects the Linux slab allocator should be used to make the most
 * efficient use of the memory.  However, large objects are not supported by
 * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
 * of 16K was determined to be optimal for architectures using 4K pages.
 */
#if 0	/* Original build-time guard elided in this excerpt; keep the conservative default. */
unsigned int spl_kmem_cache_slab_limit = 16384;
#else
unsigned int spl_kmem_cache_slab_limit = 0;
#endif
module_param(spl_kmem_cache_slab_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
	"Objects less than N bytes use the Linux slab");

unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
module_param(spl_kmem_cache_kmem_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
	"Objects less than N bytes use kmalloc");
/*
 * Slab allocation interfaces
 *
 * While the Linux slab implementation was inspired by the Solaris
 * implementation I cannot use it to emulate the Solaris APIs.  I
 * require two features which are not provided by the Linux slab.
 *
 * 1) Constructors AND destructors.  Recent versions of the Linux
 *    kernel have removed support for destructors.  This is a deal
 *    breaker for the SPL which contains particularly expensive
 *    initializers for mutexes, condition variables, etc.  We also
 *    require a minimal level of cleanup for these data types, unlike
 *    many Linux data types which do not need to be explicitly destroyed.
 *
 * 2) Virtual address space backed slab.  Callers of the Solaris slab
 *    expect it to work well for both small and very large allocations.
 *    Because of memory fragmentation the Linux slab, which is backed
 *    by kmalloc'ed memory, performs very badly when confronted with
 *    large numbers of large allocations.  Basing the slab on the
 *    virtual address space removes the need for contiguous pages
 *    and greatly improves performance for large allocations.
 *
 * For these reasons, the SPL has its own slab implementation with
 * the needed features.  It is not as highly optimized as either the
 * Solaris or Linux slabs, but it should get me most of what is
 * needed until it can be optimized or obsoleted by another approach.
 *
 * One serious concern I do have about this method is the relatively
 * small virtual address space on 32bit arches.  This will seriously
 * constrain the size of the slab caches and their performance.
 */
struct list_head spl_kmem_cache_list;	/* List of caches */
struct rw_semaphore spl_kmem_cache_sem;	/* Cache list lock */
taskq_t *spl_kmem_cache_taskq;		/* Task queue for ageing / reclaim */

static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);

SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
	spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	if (skc->skc_flags & KMC_KMEM)
		ptr = (void *)__get_free_pages(lflags, get_order(size));
	else
		ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);

	/* Resulting allocated memory will be page aligned */
	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));

	return (ptr);
}
static void
kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
{
	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));

	/*
	 * The Linux direct reclaim path uses this out of band value to
	 * determine if forward progress is being made.  Normally this is
	 * incremented by kmem_freepages() which is part of the various
	 * Linux slab implementations.  However, since we are using none
	 * of that infrastructure we are responsible for incrementing it.
	 */
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;

	if (skc->skc_flags & KMC_KMEM)
		free_pages((unsigned long)ptr, get_order(size));
	else
		vfree(ptr);
}
/*
 * Required space for each aligned sks.
 */
static inline uint32_t
spl_sks_size(spl_kmem_cache_t *skc)
{
	return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
	    skc->skc_obj_align, uint32_t));
}
/*
 * Required space for each aligned object.
 */
static inline uint32_t
spl_obj_size(spl_kmem_cache_t *skc)
{
	uint32_t align = skc->skc_obj_align;

	return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
	    P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
}
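
/*
 * Worked example (illustrative, not from the original source): assuming an
 * 8-byte skc_obj_align, a 100-byte skc_obj_size and a 32-byte spl_kmem_obj_t,
 * the object rounds up to 104 bytes and the tracking structure to 32 bytes,
 * so spl_obj_size() reports 136 bytes of required space per object.
 */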
/*
 * Lookup the spl_kmem_obj_t for an object given that object.
 */
static inline spl_kmem_obj_t *
spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
{
	return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
	    skc->skc_obj_align, uint32_t));
}
/*
 * Required space for each offslab object taking into account alignment
 * restrictions and the power-of-two requirement of kv_alloc().
 */
static inline uint32_t
spl_offslab_size(spl_kmem_cache_t *skc)
{
	return (1UL << (fls64(spl_obj_size(skc)) + 1));
}
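
/*
 * Illustrative example: continuing the 136-byte case above, fls64(136) is 8,
 * so each offslab object is allocated as a 1 << 9 = 512 byte region, keeping
 * kv_alloc() requests power-of-two sized.
 */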
/*
 * It's important that we pack the spl_kmem_obj_t structure and the
 * actual objects into one large address space to minimize the number
 * of calls to the allocator.  It is far better to do a few large
 * allocations and then subdivide it ourselves.  Now which allocator
 * we use requires balancing a few trade-offs.
 *
 * For small objects we use kmem_alloc() because as long as you are
 * only requesting a small number of pages (ideally just one) it's cheap.
 * However, when you start requesting multiple pages with kmem_alloc()
 * it gets increasingly expensive since it requires contiguous pages.
 * For this reason we shift to vmem_alloc() for slabs of large objects
 * which removes the need for contiguous pages.  We do not use
 * vmem_alloc() in all cases because there is significant locking
 * overhead in __get_vm_area_node().  This function takes a single
 * global lock when acquiring an available virtual address range which
 * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 * different allocation functions for small and large objects should
 * give us the best of both worlds.
 *
 * KMC_ONSLAB                       KMC_OFFSLAB
 *
 * +------------------------+       +-----------------+
 * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 * | skc_obj_size    <-+ |  |       +-----------------+   | |
 * | spl_kmem_obj_t      |  |                             | |
 * | skc_obj_size    <---+  |       +-----------------+   | |
 * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 * | ...                 v  |       | spl_kmem_obj_t  |     |
 * +------------------------+       +-----------------+     v
 */
static spl_kmem_slab_t *
spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
{
	spl_kmem_slab_t *sks;
	spl_kmem_obj_t *sko, *n;
	void *base, *obj;
	uint32_t obj_size, offslab_size = 0;
	int i, rc = 0;

	base = kv_alloc(skc, skc->skc_slab_size, flags);
	if (base == NULL)
		return (NULL);

	sks = (spl_kmem_slab_t *)base;
	sks->sks_magic = SKS_MAGIC;
	sks->sks_objs = skc->skc_slab_objs;
	sks->sks_age = jiffies;
	sks->sks_cache = skc;
	INIT_LIST_HEAD(&sks->sks_list);
	INIT_LIST_HEAD(&sks->sks_free_list);
	sks->sks_ref = 0;

	obj_size = spl_obj_size(skc);

	if (skc->skc_flags & KMC_OFFSLAB)
		offslab_size = spl_offslab_size(skc);

	for (i = 0; i < sks->sks_objs; i++) {
		if (skc->skc_flags & KMC_OFFSLAB) {
			obj = kv_alloc(skc, offslab_size, flags);
			if (obj == NULL) {
				rc = -ENOMEM;
				break;
			}
		} else {
			obj = base + spl_sks_size(skc) + (i * obj_size);
		}

		ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
		sko = spl_sko_from_obj(skc, obj);
		sko->sko_addr = obj;
		sko->sko_magic = SKO_MAGIC;
		sko->sko_slab = sks;
		INIT_LIST_HEAD(&sko->sko_list);
		list_add_tail(&sko->sko_list, &sks->sks_free_list);
	}

	/* On failure undo any partial offslab allocations and the slab */
	if (rc) {
		if (skc->skc_flags & KMC_OFFSLAB)
			list_for_each_entry_safe(sko,
			    n, &sks->sks_free_list, sko_list)
				kv_free(skc, sko->sko_addr, offslab_size);

		kv_free(skc, base, skc->skc_slab_size);
		sks = NULL;
	}

	return (sks);
}
/*
 * Remove a slab from the complete or partial list; it must be called with
 * the 'skc->skc_lock' held but the actual free must be performed
 * outside the lock to prevent deadlocking on vmem addresses.
 */
static void
spl_slab_free(spl_kmem_slab_t *sks,
    struct list_head *sks_list, struct list_head *sko_list)
{
	spl_kmem_cache_t *skc;

	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT(sks->sks_ref == 0);

	skc = sks->sks_cache;
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	/*
	 * Update slab/objects counters in the cache, then remove the
	 * slab from the skc->skc_partial_list.  Finally add the slab
	 * and all its objects into the private work lists where the
	 * destructors will be called and the memory freed to the system.
	 */
	skc->skc_obj_total -= sks->sks_objs;
	skc->skc_slab_total--;
	list_del(&sks->sks_list);
	list_add(&sks->sks_list, sks_list);
	list_splice_init(&sks->sks_free_list, sko_list);
}
/*
 * Traverse all the partial slabs attached to a cache and free those which
 * are currently empty, and have not been touched for skc_delay seconds to
 * avoid thrashing.  The count argument is passed to optionally cap the
 * number of slabs reclaimed; a count of zero means try and reclaim
 * everything.  When flag is set, available slabs are freed regardless
 * of their age.
 */
static void
spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
{
	spl_kmem_slab_t *sks, *m;
	spl_kmem_obj_t *sko, *n;
	LIST_HEAD(sks_list);
	LIST_HEAD(sko_list);
	uint32_t size = 0;
	int i = 0;

	/*
	 * Move empty slabs and objects which have not been touched in
	 * skc_delay seconds on to private lists to be freed outside
	 * the spin lock.  This delay time is important to avoid thrashing
	 * however when flag is set the delay will not be used.
	 */
	spin_lock(&skc->skc_lock);
	list_for_each_entry_safe_reverse(sks, m,
	    &skc->skc_partial_list, sks_list) {
		/*
		 * All empty slabs are at the end of skc->skc_partial_list,
		 * therefore once a non-empty slab is found we can stop
		 * scanning.  Additionally, stop when reaching the target
		 * reclaim 'count' if a non-zero threshold is given.
		 */
		if ((sks->sks_ref > 0) || (count && i >= count))
			break;

		if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) ||
		    flag) {
			spl_slab_free(sks, &sks_list, &sko_list);
			i++;
		}
	}
	spin_unlock(&skc->skc_lock);

	/*
	 * The following two loops ensure all the object destructors are
	 * run, any offslab objects are freed, and the slabs themselves
	 * are freed.  This is all done outside the skc->skc_lock since
	 * this allows the destructor to sleep, and allows us to perform
	 * a conditional reschedule when freeing a large number of
	 * objects and slabs back to the system.
	 */
	if (skc->skc_flags & KMC_OFFSLAB)
		size = spl_offslab_size(skc);

	list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
		ASSERT(sko->sko_magic == SKO_MAGIC);

		if (skc->skc_flags & KMC_OFFSLAB)
			kv_free(skc, sko->sko_addr, size);
	}

	list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
		ASSERT(sks->sks_magic == SKS_MAGIC);
		kv_free(skc, sks, skc->skc_slab_size);
	}
}
static spl_kmem_emergency_t *
spl_emergency_search(struct rb_root *root, void *obj)
{
	struct rb_node *node = root->rb_node;
	spl_kmem_emergency_t *ske;
	unsigned long address = (unsigned long)obj;

	while (node) {
		ske = container_of(node, spl_kmem_emergency_t, ske_node);

		if (address < (unsigned long)ske->ske_obj)
			node = node->rb_left;
		else if (address > (unsigned long)ske->ske_obj)
			node = node->rb_right;
		else
			return (ske);
	}

	return (NULL);
}
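
/*
 * Insert a single emergency object into the red black tree, keyed by the
 * object's address.  Returns nonzero on success and zero if an entry for
 * the address already exists.
 */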
static int
spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	spl_kmem_emergency_t *ske_tmp;
	unsigned long address = (unsigned long)ske->ske_obj;

	while (*new) {
		ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);

		parent = *new;
		if (address < (unsigned long)ske_tmp->ske_obj)
			new = &((*new)->rb_left);
		else if (address > (unsigned long)ske_tmp->ske_obj)
			new = &((*new)->rb_right);
		else
			return (0);
	}

	rb_link_node(&ske->ske_node, parent, new);
	rb_insert_color(&ske->ske_node, root);

	return (1);
}
/*
 * Allocate a single emergency object and track it in a red black tree.
 */
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
{
	gfp_t lflags = kmem_flags_convert(flags);
	spl_kmem_emergency_t *ske;
	int empty;

	/* Last chance use a partial slab if one now exists */
	spin_lock(&skc->skc_lock);
	empty = list_empty(&skc->skc_partial_list);
	spin_unlock(&skc->skc_lock);
	if (!empty)
		return (-EEXIST);

	ske = kmalloc(sizeof (*ske), lflags);
	if (ske == NULL)
		return (-ENOMEM);

	ske->ske_obj = kmalloc(skc->skc_obj_size, lflags);
	if (ske->ske_obj == NULL) {
		kfree(ske);
		return (-ENOMEM);
	}

	spin_lock(&skc->skc_lock);
	empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
	if (likely(empty)) {
		skc->skc_obj_total++;
		skc->skc_obj_emergency++;
		if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
			skc->skc_obj_emergency_max = skc->skc_obj_emergency;
	}
	spin_unlock(&skc->skc_lock);

	if (unlikely(!empty)) {
		kfree(ske->ske_obj);
		kfree(ske);
		return (-EINVAL);
	}

	*obj = ske->ske_obj;

	return (0);
}
/*
 * Locate the passed object in the red black tree and free it.
 */
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_emergency_t *ske;

	spin_lock(&skc->skc_lock);
	ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
	if (ske) {
		rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
		skc->skc_obj_emergency--;
		skc->skc_obj_total--;
	}
	spin_unlock(&skc->skc_lock);

	if (unlikely(ske == NULL))
		return (-ENOENT);

	kfree(ske->ske_obj);
	kfree(ske);

	return (0);
}
/*
 * Release objects from the per-cpu magazine back to their slab.  The flush
 * argument contains the max number of entries to remove from the magazine.
 */
static void
__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
{
	int i, count = MIN(flush, skm->skm_avail);

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	for (i = 0; i < count; i++)
		spl_cache_shrink(skc, skm->skm_objs[i]);

	skm->skm_avail -= count;
	memmove(skm->skm_objs, &(skm->skm_objs[count]),
	    sizeof (void *) * skm->skm_avail);
}
static void
spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
{
	spin_lock(&skc->skc_lock);
	__spl_cache_flush(skc, skm, flush);
	spin_unlock(&skc->skc_lock);
}
static void
spl_magazine_age(void *data)
{
	spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];

	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT(skm->skm_cpu == smp_processor_id());
	ASSERT(irqs_disabled());

	/* There are no available objects or they are too young to age out */
	if ((skm->skm_avail == 0) ||
	    time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
		return;

	/*
	 * Because we're executing in interrupt context we may have
	 * interrupted the holder of this lock.  To avoid a potential
	 * deadlock return if the lock is contended.
	 */
	if (!spin_trylock(&skc->skc_lock))
		return;

	__spl_cache_flush(skc, skm, skm->skm_refill);
	spin_unlock(&skc->skc_lock);
}
/*
 * Called regularly to keep a downward pressure on the cache.
 *
 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
 * be returned to the caches.  This is done to prevent idle magazines from
 * holding memory which could be better used elsewhere.  The delay is
 * present to prevent thrashing the magazine.
 *
 * The newly released objects may result in empty partial slabs.  Those
 * slabs should be released to the system.  Otherwise moving the objects
 * out of the magazines is just wasted work.
 */
static void
spl_cache_age(void *data)
{
	spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
	taskqid_t id = 0;

	ASSERT(skc->skc_magic == SKC_MAGIC);

	/* Dynamically disabled at run time */
	if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
		return;

	atomic_inc(&skc->skc_ref);

	if (!(skc->skc_flags & KMC_NOMAGAZINE))
		on_each_cpu(spl_magazine_age, skc, 1);

	spl_slab_reclaim(skc, skc->skc_reap, 0);

	while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
		id = taskq_dispatch_delay(
		    spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
		    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);

		/* Destroy issued after dispatch immediately cancel it */
		if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
			taskq_cancel_id(spl_kmem_cache_taskq, id);
	}

	spin_lock(&skc->skc_lock);
	skc->skc_taskqid = id;
	spin_unlock(&skc->skc_lock);

	atomic_dec(&skc->skc_ref);
}
/*
 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
 * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
 * for very small objects we may end up with more than this so as not
 * to waste space in the minimal allocation of a single page.  Also for
 * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
 * lower than this and we will fail.
 */
static int
spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
{
	uint32_t sks_size, obj_size, max_size;

	if (skc->skc_flags & KMC_OFFSLAB) {
		*objs = spl_kmem_cache_obj_per_slab;
		*size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
		return (0);
	}

	sks_size = spl_sks_size(skc);
	obj_size = spl_obj_size(skc);

	if (skc->skc_flags & KMC_KMEM)
		max_size = ((uint32_t)1 << (MAX_ORDER - 3)) * PAGE_SIZE;
	else
		max_size = (spl_kmem_cache_max_size * 1024 * 1024);

	/* Power of two sized slab */
	for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
		*objs = (*size - sks_size) / obj_size;
		if (*objs >= spl_kmem_cache_obj_per_slab)
			return (0);
	}

	/*
	 * Unable to satisfy target objects per slab, fall back to
	 * allocating a maximally sized slab and assuming it can
	 * contain the minimum objects count use it.  If not fail.
	 */
	*size = max_size;
	*objs = (*size - sks_size) / obj_size;
	if (*objs >= (spl_kmem_cache_obj_per_slab_min))
		return (0);

	return (-ENOSPC);
}
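
/*
 * Illustrative sizing example (assumed values, not from the original
 * source): with a 512-byte obj_size, a 64-byte sks_size and a target of
 * 8 objects per slab, a 4096-byte slab only fits (4096 - 64) / 512 = 7
 * objects, so the loop doubles to 8192 bytes which fits 15 objects and
 * satisfies the target.
 */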
/*
 * Make a guess at reasonable per-cpu magazine size based on the size of
 * each object and the cost of caching N of them in each magazine.  Long
 * term this should really adapt based on an observed usage heuristic.
 */
static int
spl_magazine_size(spl_kmem_cache_t *skc)
{
	uint32_t obj_size = spl_obj_size(skc);
	int size;

	/* Per-magazine sizes below assume a 4Kib page size */
	if (obj_size > (PAGE_SIZE * 256))
		size = 4;	/* Minimum 4Mib per-magazine */
	else if (obj_size > (PAGE_SIZE * 32))
		size = 16;	/* Minimum 2Mib per-magazine */
	else if (obj_size > (PAGE_SIZE))
		size = 64;	/* Minimum 256Kib per-magazine */
	else if (obj_size > (PAGE_SIZE / 4))
		size = 128;	/* Minimum 128Kib per-magazine */
	else
		size = 256;

	return (size);
}
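
/*
 * Illustrative example: with 4Kib pages, an 8Kib object falls into the
 * "obj_size > PAGE_SIZE" bucket above, so each per-cpu magazine caches up
 * to 64 objects, or roughly 512Kib per cpu for that cache.
 */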
/*
 * Allocate a per-cpu magazine to associate with a specific core.
 */
static spl_kmem_magazine_t *
spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
{
	spl_kmem_magazine_t *skm;
	int size = sizeof (spl_kmem_magazine_t) +
	    sizeof (void *) * skc->skc_mag_size;

	skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
	if (skm) {
		skm->skm_magic = SKM_MAGIC;
		skm->skm_avail = 0;
		skm->skm_size = skc->skc_mag_size;
		skm->skm_refill = skc->skc_mag_refill;
		skm->skm_cache = skc;
		skm->skm_age = jiffies;
		skm->skm_cpu = cpu;
	}

	return (skm);
}
/*
 * Free a per-cpu magazine associated with a specific core.
 */
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{
	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT(skm->skm_avail == 0);
	kfree(skm);
}
/*
 * Create all per-cpu magazines of reasonable sizes.
 */
static int
spl_magazine_create(spl_kmem_cache_t *skc)
{
	int i;

	if (skc->skc_flags & KMC_NOMAGAZINE)
		return (0);

	skc->skc_mag_size = spl_magazine_size(skc);
	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;

	for_each_online_cpu(i) {
		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
		if (!skc->skc_mag[i]) {
			for (i--; i >= 0; i--)
				spl_magazine_free(skc->skc_mag[i]);

			return (-ENOMEM);
		}
	}

	return (0);
}
/*
 * Destroy all per-cpu magazines.
 */
static void
spl_magazine_destroy(spl_kmem_cache_t *skc)
{
	spl_kmem_magazine_t *skm;
	int i;

	if (skc->skc_flags & KMC_NOMAGAZINE)
		return;

	for_each_online_cpu(i) {
		skm = skc->skc_mag[i];
		spl_cache_flush(skc, skm, skm->skm_avail);
		spl_magazine_free(skm);
	}
}
/*
 * Create an object cache based on the following arguments:
 * name		cache name
 * size		cache object size
 * align	cache object alignment
 * ctor		cache object constructor
 * dtor		cache object destructor
 * reclaim	cache object reclaim
 * priv		cache private data for ctor/dtor/reclaim
 * vmp		unused must be NULL
 * flags
 *	KMC_NOTOUCH	Disable cache object aging (unsupported)
 *	KMC_NODEBUG	Disable debugging (unsupported)
 *	KMC_NOHASH	Disable hashing (unsupported)
 *	KMC_QCACHE	Disable qcache (unsupported)
 *	KMC_NOMAGAZINE	Enabled for kmem/vmem, Disabled for Linux slab
 *	KMC_KMEM	Force kmem backed cache
 *	KMC_VMEM	Force vmem backed cache
 *	KMC_SLAB	Force Linux slab backed cache
 *	KMC_OFFSLAB	Locate objects off the slab
 */
spl_kmem_cache_t *
spl_kmem_cache_create(char *name, size_t size, size_t align,
    spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
    void *priv, void *vmp, int flags)
{
	gfp_t lflags = kmem_flags_convert(KM_SLEEP);
	spl_kmem_cache_t *skc;
	int rc;

	/*
	 * Unsupported flags
	 */
	ASSERT0(flags & KMC_NOMAGAZINE);
	ASSERT0(flags & KMC_NOHASH);
	ASSERT0(flags & KMC_QCACHE);

	/*
	 * Allocate memory for a new cache and initialize it.  Unfortunately,
	 * this usually ends up being a large allocation of ~32k because
	 * we need to allocate enough memory for the worst case number of
	 * cpus in the magazine, skc_mag[NR_CPUS].
	 */
	skc = kzalloc(sizeof (*skc), lflags);
	if (skc == NULL)
		return (NULL);

	skc->skc_magic = SKC_MAGIC;
	skc->skc_name_size = strlen(name) + 1;
	skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
	if (skc->skc_name == NULL) {
		kfree(skc);
		return (NULL);
	}
	strncpy(skc->skc_name, name, skc->skc_name_size);

	skc->skc_ctor = ctor;
	skc->skc_dtor = dtor;
	skc->skc_reclaim = reclaim;
	skc->skc_private = priv;
	skc->skc_linux_cache = NULL;
	skc->skc_flags = flags;
	skc->skc_obj_size = size;
	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
	skc->skc_delay = SPL_KMEM_CACHE_DELAY;
	skc->skc_reap = SPL_KMEM_CACHE_REAP;
	atomic_set(&skc->skc_ref, 0);

	INIT_LIST_HEAD(&skc->skc_list);
	INIT_LIST_HEAD(&skc->skc_complete_list);
	INIT_LIST_HEAD(&skc->skc_partial_list);
	skc->skc_emergency_tree = RB_ROOT;
	spin_lock_init(&skc->skc_lock);
	init_waitqueue_head(&skc->skc_waitq);
	skc->skc_slab_fail = 0;
	skc->skc_slab_create = 0;
	skc->skc_slab_destroy = 0;
	skc->skc_slab_total = 0;
	skc->skc_slab_alloc = 0;
	skc->skc_slab_max = 0;
	skc->skc_obj_total = 0;
	skc->skc_obj_alloc = 0;
	skc->skc_obj_max = 0;
	skc->skc_obj_deadlock = 0;
	skc->skc_obj_emergency = 0;
	skc->skc_obj_emergency_max = 0;
	/*
	 * Verify the requested alignment restriction is sane.
	 */
	if (align) {
		VERIFY(ISP2(align));
		VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
		VERIFY3U(align, <=, PAGE_SIZE);
		skc->skc_obj_align = align;
	}

	/*
	 * When no specific type of slab is requested (kmem, vmem, or
	 * linuxslab) then select a cache type based on the object size
	 * and default tunables.
	 */
	if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
		/*
		 * Objects smaller than spl_kmem_cache_slab_limit can
		 * use the Linux slab for better space-efficiency.  By
		 * default this functionality is disabled until its
		 * performance characteristics are fully understood.
		 */
		if (spl_kmem_cache_slab_limit &&
		    size <= (size_t)spl_kmem_cache_slab_limit)
			skc->skc_flags |= KMC_SLAB;

		/*
		 * Small objects, less than spl_kmem_cache_kmem_limit per
		 * object should use kmem because their slabs are small.
		 */
		else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
			skc->skc_flags |= KMC_KMEM;

		/*
		 * All other objects are considered large and are placed
		 * on vmem backed slabs.
		 */
		else
			skc->skc_flags |= KMC_VMEM;
	}
	/*
	 * Given the type of slab allocate the required resources.
	 */
	if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
		rc = spl_slab_size(skc,
		    &skc->skc_slab_objs, &skc->skc_slab_size);
		if (rc)
			goto out;

		rc = spl_magazine_create(skc);
		if (rc)
			goto out;
	} else {
		skc->skc_linux_cache = kmem_cache_create(
		    skc->skc_name, size, align, 0, NULL);
		if (skc->skc_linux_cache == NULL) {
			rc = -ENOMEM;
			goto out;
		}

#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
		skc->skc_linux_cache->allocflags |= __GFP_COMP;
#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
		skc->skc_linux_cache->gfpflags |= __GFP_COMP;
#endif
		skc->skc_flags |= KMC_NOMAGAZINE;
	}

	if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
		skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
		    spl_cache_age, skc, TQ_SLEEP,
		    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);

	down_write(&spl_kmem_cache_sem);
	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
	up_write(&spl_kmem_cache_sem);

	return (skc);
out:
	kfree(skc->skc_name);
	kfree(skc);
	return (NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_create);
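
/*
 * Illustrative usage sketch (hypothetical names, not from the original
 * source): a consumer typically creates a cache once, allocates and frees
 * objects from it, and destroys it on teardown, e.g.
 *
 *	static spl_kmem_cache_t *foo_cache;
 *
 *	foo_cache = spl_kmem_cache_create("foo_cache", sizeof (foo_t),
 *	    0, foo_ctor, foo_dtor, NULL, NULL, NULL, 0);
 *	foo_t *fp = spl_kmem_cache_alloc(foo_cache, KM_SLEEP);
 *	...
 *	spl_kmem_cache_free(foo_cache, fp);
 *	spl_kmem_cache_destroy(foo_cache);
 */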
/*
 * Register a move callback for cache defragmentation.
 * XXX: Unimplemented but harmless to stub out for now.
 */
void
spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
    kmem_cbrc_t (move)(void *, void *, size_t, void *))
{
	ASSERT(move != NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_set_move);
/*
 * Destroy a cache and all objects associated with the cache.
 */
void
spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
{
	DECLARE_WAIT_QUEUE_HEAD(wq);
	taskqid_t id;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));

	down_write(&spl_kmem_cache_sem);
	list_del_init(&skc->skc_list);
	up_write(&spl_kmem_cache_sem);

	/* Cancel and wait for any pending delayed tasks */
	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	spin_lock(&skc->skc_lock);
	id = skc->skc_taskqid;
	spin_unlock(&skc->skc_lock);

	taskq_cancel_id(spl_kmem_cache_taskq, id);

	/*
	 * Wait until all current callers complete, this is mainly
	 * to catch the case where a low memory situation triggers a
	 * cache reaping action which races with this destroy.
	 */
	wait_event(wq, atomic_read(&skc->skc_ref) == 0);

	if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
		spl_magazine_destroy(skc);
		spl_slab_reclaim(skc, 0, 1);
	} else {
		ASSERT(skc->skc_flags & KMC_SLAB);
		kmem_cache_destroy(skc->skc_linux_cache);
	}

	spin_lock(&skc->skc_lock);

	/*
	 * Validate there are no objects in use and free all the
	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
	 */
	ASSERT3U(skc->skc_slab_alloc, ==, 0);
	ASSERT3U(skc->skc_obj_alloc, ==, 0);
	ASSERT3U(skc->skc_slab_total, ==, 0);
	ASSERT3U(skc->skc_obj_total, ==, 0);
	ASSERT3U(skc->skc_obj_emergency, ==, 0);
	ASSERT(list_empty(&skc->skc_complete_list));

	spin_unlock(&skc->skc_lock);

	kfree(skc->skc_name);
	kfree(skc);
}
EXPORT_SYMBOL(spl_kmem_cache_destroy);
/*
 * Allocate an object from a slab attached to the cache.  This is used to
 * repopulate the per-cpu magazine caches in batches when they run low.
 */
static void *
spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
{
	spl_kmem_obj_t *sko;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
	ASSERT(sko->sko_magic == SKO_MAGIC);
	ASSERT(sko->sko_addr != NULL);

	/* Remove from sks_free_list */
	list_del_init(&sko->sko_list);

	sks->sks_age = jiffies;
	sks->sks_ref++;
	skc->skc_obj_alloc++;

	/* Track max obj usage statistics */
	if (skc->skc_obj_alloc > skc->skc_obj_max)
		skc->skc_obj_max = skc->skc_obj_alloc;

	/* Track max slab usage statistics */
	if (sks->sks_ref == 1) {
		skc->skc_slab_alloc++;

		if (skc->skc_slab_alloc > skc->skc_slab_max)
			skc->skc_slab_max = skc->skc_slab_alloc;
	}

	return (sko->sko_addr);
}
/*
 * Generic slab allocation function to be run by the global work queues.
 * It is responsible for allocating a new slab, linking it in to the list
 * of partial slabs, and then waking any waiters.
 */
static void
spl_cache_grow_work(void *data)
{
	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
	spl_kmem_cache_t *skc = ska->ska_cache;
	spl_kmem_slab_t *sks;

#if defined(PF_MEMALLOC_NOIO)
	unsigned noio_flag = memalloc_noio_save();
	sks = spl_slab_alloc(skc, ska->ska_flags);
	memalloc_noio_restore(noio_flag);
#else
	sks = spl_slab_alloc(skc, ska->ska_flags);
#endif

	spin_lock(&skc->skc_lock);
	if (sks) {
		skc->skc_slab_total++;
		skc->skc_obj_total += sks->sks_objs;
		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
	}

	atomic_dec(&skc->skc_ref);
	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
	clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
	wake_up_all(&skc->skc_waitq);
	spin_unlock(&skc->skc_lock);

	kfree(ska);
}
/*
 * Returns non-zero when a new slab should be available.
 */
static int
spl_cache_grow_wait(spl_kmem_cache_t *skc)
{
	return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
}
/*
 * No available objects on any slabs, create a new slab.  Note that this
 * functionality is disabled for KMC_SLAB caches which are backed by the
 * Linux slab.
 */
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
	int remaining, rc = 0;

	ASSERT0(flags & ~KM_PUBLIC_MASK);
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT((skc->skc_flags & KMC_SLAB) == 0);

	/*
	 * Before allocating a new slab wait for any reaping to complete and
	 * then return so the local magazine can be rechecked for new objects.
	 */
	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
		rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
		    TASK_UNINTERRUPTIBLE);
		return (rc ? rc : -EAGAIN);
	}

	/*
	 * This is handled by dispatching a work request to the global work
	 * queue.  This allows us to asynchronously allocate a new slab while
	 * retaining the ability to safely fall back to smaller synchronous
	 * allocations to ensure forward progress is always maintained.
	 */
	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
		spl_kmem_alloc_t *ska;

		ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
		if (ska == NULL) {
			clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
			wake_up_all(&skc->skc_waitq);
			return (-ENOMEM);
		}

		atomic_inc(&skc->skc_ref);
		ska->ska_cache = skc;
		ska->ska_flags = flags;
		taskq_init_ent(&ska->ska_tqe);
		taskq_dispatch_ent(spl_kmem_cache_taskq,
		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
	}

	/*
	 * The goal here is to only detect the rare case where a virtual slab
	 * allocation has deadlocked.  We must be careful to minimize the use
	 * of emergency objects which are more expensive to track.  Therefore,
	 * we set a very long timeout for the asynchronous allocation and if
	 * the timeout is reached the cache is flagged as deadlocked.  From
	 * this point only new emergency objects will be allocated until the
	 * asynchronous allocation completes and clears the deadlocked flag.
	 */
	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
		rc = spl_emergency_alloc(skc, flags, obj);
	} else {
		remaining = wait_event_timeout(skc->skc_waitq,
		    spl_cache_grow_wait(skc), HZ);

		if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
			spin_lock(&skc->skc_lock);
			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
				skc->skc_obj_deadlock++;
			}
			spin_unlock(&skc->skc_lock);
		}

		rc = -ENOMEM;
	}

	return (rc);
}
/*
 * Refill a per-cpu magazine with objects from the slabs for this cache.
 * Ideally the magazine can be repopulated using existing objects which have
 * been released, however if we are unable to locate enough free objects new
 * slabs of objects will be created.  On success NULL is returned, otherwise
 * the address of a single emergency object is returned for use by the caller.
 */
static void *
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
	spl_kmem_slab_t *sks;
	int count = 0, rc, refill;
	void *obj = NULL;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);

	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
	spin_lock(&skc->skc_lock);

	while (refill > 0) {
		/* No slabs available we may need to grow the cache */
		if (list_empty(&skc->skc_partial_list)) {
			spin_unlock(&skc->skc_lock);

			local_irq_enable();
			rc = spl_cache_grow(skc, flags, &obj);
			local_irq_disable();

			/* Emergency object for immediate use by caller */
			if (rc == 0 && obj != NULL)
				goto out;

			if (rc)
				goto out;

			/* Rescheduled to different CPU skm is not local */
			if (skm != skc->skc_mag[smp_processor_id()])
				goto out;

			/*
			 * Potentially rescheduled to the same CPU but
			 * allocations may have occurred from this CPU while
			 * we were sleeping so recalculate max refill.
			 */
			refill = MIN(refill, skm->skm_size - skm->skm_avail);

			spin_lock(&skc->skc_lock);
			continue;
		}

		/* Grab the next available slab */
		sks = list_entry((&skc->skc_partial_list)->next,
		    spl_kmem_slab_t, sks_list);
		ASSERT(sks->sks_magic == SKS_MAGIC);
		ASSERT(sks->sks_ref < sks->sks_objs);
		ASSERT(!list_empty(&sks->sks_free_list));

		/*
		 * Consume as many objects as needed to refill the requested
		 * cache.  We must also be careful not to overfill it.
		 */
		while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
		    ++count) {
			ASSERT(skm->skm_avail < skm->skm_size);
			ASSERT(count < skm->skm_size);
			skm->skm_objs[skm->skm_avail++] =
			    spl_cache_obj(skc, sks);
		}

		/* Move slab to skc_complete_list when full */
		if (sks->sks_ref == sks->sks_objs) {
			list_del(&sks->sks_list);
			list_add(&sks->sks_list, &skc->skc_complete_list);
		}
	}

	spin_unlock(&skc->skc_lock);
out:
	return (obj);
}
/*
 * Release an object back to the slab from which it came.
 */
static void
spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_slab_t *sks = NULL;
	spl_kmem_obj_t *sko = NULL;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	sko = spl_sko_from_obj(skc, obj);
	ASSERT(sko->sko_magic == SKO_MAGIC);
	sks = sko->sko_slab;
	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT(sks->sks_cache == skc);
	list_add(&sko->sko_list, &sks->sks_free_list);

	sks->sks_age = jiffies;
	sks->sks_ref--;
	skc->skc_obj_alloc--;

	/*
	 * Move slab to skc_partial_list when no longer full.  Slabs
	 * are added to the head to keep the partial list in quasi-full
	 * sorted order.  Fuller at the head, emptier at the tail.
	 */
	if (sks->sks_ref == (sks->sks_objs - 1)) {
		list_del(&sks->sks_list);
		list_add(&sks->sks_list, &skc->skc_partial_list);
	}

	/*
	 * Move empty slabs to the end of the partial list so
	 * they can be easily found and freed during reclamation.
	 */
	if (sks->sks_ref == 0) {
		list_del(&sks->sks_list);
		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
		skc->skc_slab_alloc--;
	}
}
/*
 * Allocate an object from the per-cpu magazine, or if the magazine
 * is empty directly allocate from a slab and repopulate the magazine.
 */
void *
spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
{
	spl_kmem_magazine_t *skm;
	void *obj = NULL;

	ASSERT0(flags & ~KM_PUBLIC_MASK);
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	atomic_inc(&skc->skc_ref);

	/*
	 * Allocate directly from a Linux slab.  All optimizations are left
	 * to the underlying cache; we only need to guarantee that KM_SLEEP
	 * callers will never fail.
	 */
	if (skc->skc_flags & KMC_SLAB) {
		struct kmem_cache *slc = skc->skc_linux_cache;

		do {
			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
		} while ((obj == NULL) && !(flags & KM_NOSLEEP));

		goto ret;
	}

	local_irq_disable();

restart:
	/*
	 * Safe to update per-cpu structure without lock, but
	 * in the restart case we must be careful to reacquire
	 * the local magazine since this may have changed
	 * when we need to grow the cache.
	 */
	skm = skc->skc_mag[smp_processor_id()];
	ASSERT(skm->skm_magic == SKM_MAGIC);

	if (likely(skm->skm_avail)) {
		/* Object available in CPU cache, use it */
		obj = skm->skm_objs[--skm->skm_avail];
		skm->skm_age = jiffies;
	} else {
		obj = spl_cache_refill(skc, skm, flags);
		if ((obj == NULL) && !(flags & KM_NOSLEEP))
			goto restart;

		local_irq_enable();
		goto ret;
	}

	local_irq_enable();
	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));

ret:
	/* Pre-emptively migrate object to CPU L1 cache */
	if (obj && skc->skc_ctor)
		skc->skc_ctor(obj, skc->skc_private, flags);

	atomic_dec(&skc->skc_ref);

	return (obj);
}
EXPORT_SYMBOL(spl_kmem_cache_alloc);
/*
 * Free an object back to the local per-cpu magazine; there is no
 * guarantee that this is the same magazine the object was originally
 * allocated from.  We may need to flush entries from the magazine
 * back to the slabs to make space.
 */
void
spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_magazine_t *skm;
	unsigned long flags;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
	atomic_inc(&skc->skc_ref);

	/*
	 * Run the destructor
	 */
	if (skc->skc_dtor)
		skc->skc_dtor(obj, skc->skc_private);

	/*
	 * Free the object from the underlying Linux slab.
	 */
	if (skc->skc_flags & KMC_SLAB) {
		kmem_cache_free(skc->skc_linux_cache, obj);
		goto out;
	}

	/*
	 * Only virtual slabs may have emergency objects and these objects
	 * are guaranteed to have physical addresses.  They must be removed
	 * from the tree of emergency objects and then freed.
	 */
	if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) {
		spl_emergency_free(skc, obj);
		goto out;
	}

	local_irq_save(flags);

	/*
	 * Safe to update per-cpu structure without lock, but since no
	 * remote memory allocation tracking is being performed it is
	 * entirely possible to allocate an object from one CPU cache and
	 * return it to another.
	 */
	skm = skc->skc_mag[smp_processor_id()];
	ASSERT(skm->skm_magic == SKM_MAGIC);

	/* Per-CPU cache full, flush it to make space */
	if (unlikely(skm->skm_avail >= skm->skm_size))
		spl_cache_flush(skc, skm, skm->skm_refill);

	/* Available space in cache, use it */
	skm->skm_objs[skm->skm_avail++] = obj;

	local_irq_restore(flags);
out:
	atomic_dec(&skc->skc_ref);
}
EXPORT_SYMBOL(spl_kmem_cache_free);
/*
 * The generic shrinker function for all caches.  Under Linux a shrinker
 * may not be tightly coupled with a slab cache.  In fact Linux always
 * systematically tries calling all registered shrinker callbacks which
 * report that they contain unused objects.  Because of this we only
 * register one shrinker function in the shim layer for all slab caches.
 * We always attempt to shrink all caches when this generic shrinker
 * is called.
 *
 * If sc->nr_to_scan is zero, the caller is requesting a query of the
 * number of objects which can potentially be freed.  If it is nonzero,
 * the request is to free that many objects.
 *
 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
 * in struct shrinker and also require the shrinker to return the number
 * of objects freed.
 *
 * Older kernels require the shrinker to return the number of freeable
 * objects following the freeing of nr_to_free.
 *
 * Linux semantics differ from those under Solaris, which are to
 * free all available objects which may (and probably will) be more
 * objects than the requested nr_to_scan.
 */
static spl_shrinker_t
__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
    struct shrink_control *sc)
{
	spl_kmem_cache_t *skc;
	int alloc = 0;

	down_read(&spl_kmem_cache_sem);
	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
		if (sc->nr_to_scan) {
#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
			uint64_t oldalloc = skc->skc_obj_alloc;
			spl_kmem_cache_reap_now(skc,
			    MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
			if (oldalloc > skc->skc_obj_alloc)
				alloc += oldalloc - skc->skc_obj_alloc;
#else
			spl_kmem_cache_reap_now(skc,
			    MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
			alloc += skc->skc_obj_alloc;
#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
		} else {
			/* Request to query number of freeable objects */
			alloc += skc->skc_obj_alloc;
		}
	}
	up_read(&spl_kmem_cache_sem);

	/*
	 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
	 * This functionality only exists to work around a rare issue where
	 * shrink_slabs() is repeatedly invoked by many cores causing the
	 * system to thrash.
	 */
	if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
		return (SHRINK_STOP);

	return (MAX(alloc, 0));
}

SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
/*
 * Call the registered reclaim function for a cache.  Depending on how
 * many and which objects are released it may simply repopulate the
 * local magazine which will then need to age-out.  Objects which cannot
 * fit in the magazine will be released back to their slabs which will
 * also need to age out before being released.  This is all just best
 * effort and we do not want to thrash creating and destroying slabs.
 */
void
spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
{
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	atomic_inc(&skc->skc_ref);

	/*
	 * Execute the registered reclaim callback if it exists.  The
	 * per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
	 */
	if (skc->skc_flags & KMC_SLAB) {
		if (skc->skc_reclaim)
			skc->skc_reclaim(skc->skc_private);

		if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
			kmem_cache_shrink(skc->skc_linux_cache);

		goto out;
	}

	/*
	 * Prevent concurrent cache reaping when contended.
	 */
	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
		goto out;

	/*
	 * When a reclaim function is available it may be invoked repeatedly
	 * until at least a single slab can be freed.  This ensures that we
	 * do free memory back to the system.  This helps minimize the chance
	 * of an OOM event when the bulk of memory is used by the slab.
	 *
	 * When free slabs are already available the reclaim callback will be
	 * skipped.  Additionally, if no forward progress is detected despite
	 * a reclaim function the cache will be skipped to avoid deadlock.
	 *
	 * Longer term this would be the correct place to add the code which
	 * repacks the slabs in order to minimize fragmentation.
	 */
	if (skc->skc_reclaim) {
		uint64_t objects = UINT64_MAX;
		int do_reclaim;

		do {
			spin_lock(&skc->skc_lock);
			do_reclaim =
			    (skc->skc_slab_total > 0) &&
			    ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
			    (skc->skc_obj_alloc < objects);

			objects = skc->skc_obj_alloc;
			spin_unlock(&skc->skc_lock);

			if (do_reclaim)
				skc->skc_reclaim(skc->skc_private);

		} while (do_reclaim);
	}

	/* Reclaim from the magazine then the slabs ignoring age and delay. */
	if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
		spl_kmem_magazine_t *skm;
		unsigned long irq_flags;

		local_irq_save(irq_flags);
		skm = skc->skc_mag[smp_processor_id()];
		spl_cache_flush(skc, skm, skm->skm_avail);
		local_irq_restore(irq_flags);
	}

	spl_slab_reclaim(skc, count, 1);
	clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
	wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
out:
	atomic_dec(&skc->skc_ref);
}
EXPORT_SYMBOL(spl_kmem_cache_reap_now);
/*
 * Reap all free slabs from all registered caches.
 */
void
spl_kmem_reap(void)
{
	struct shrink_control sc;

	sc.nr_to_scan = KMC_REAP_CHUNK;
	sc.gfp_mask = GFP_KERNEL;

	(void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
}
EXPORT_SYMBOL(spl_kmem_reap);
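
/*
 * Module init hook: set up the global cache list and its lock, create the
 * taskq used for cache ageing and asynchronous slab growth, and register
 * the generic shrinker with the kernel.
 */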
int
spl_kmem_cache_init(void)
{
	init_rwsem(&spl_kmem_cache_sem);
	INIT_LIST_HEAD(&spl_kmem_cache_list);
	spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
	    1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
	spl_register_shrinker(&spl_kmem_cache_shrinker);

	return (0);
}
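
/*
 * Module exit hook: unregister the shrinker and tear down the cache taskq.
 * Individual caches are expected to have been destroyed by their consumers
 * before this point.
 */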
void
spl_kmem_cache_fini(void)
{
	spl_unregister_shrinker(&spl_kmem_cache_shrinker);
	taskq_destroy(spl_kmem_cache_taskq);
}