1/*****************************************************************************\
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
6 * UCRL-CODE-235197
7 *
8 * This file is part of the SPL, Solaris Porting Layer.
9 * For details, see <http://zfsonlinux.org/>.
10 *
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 *
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 * for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 *****************************************************************************
24 * Solaris Porting Layer (SPL) Kmem Implementation.
25\*****************************************************************************/
26
27#include <sys/kmem.h>
28#include <sys/kmem_cache.h>
29#include <sys/taskq.h>
30#include <sys/timer.h>
31#include <sys/vmem.h>
32#include <linux/slab.h>
33#include <linux/swap.h>
34#include <linux/mm_compat.h>
35#include <linux/wait_compat.h>
36
37/*
 38 * Within the scope of this file the kmem_cache_* definitions
39 * are removed to allow access to the real Linux slab allocator.
40 */
41#undef kmem_cache_destroy
42#undef kmem_cache_create
43#undef kmem_cache_alloc
44#undef kmem_cache_free
45
46
47/*
48 * Cache expiration was implemented because it was part of the default Solaris
49 * kmem_cache behavior. The idea is that per-cpu objects which haven't been
50 * accessed in several seconds should be returned to the cache. On the other
51 * hand Linux slabs never move objects back to the slabs unless there is
52 * memory pressure on the system. By default the Linux method is enabled
53 * because it has been shown to improve responsiveness on low memory systems.
54 * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
55 */
56unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
57EXPORT_SYMBOL(spl_kmem_cache_expire);
58module_param(spl_kmem_cache_expire, uint, 0644);
59MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
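
/*
 * Example: because the parameter above is registered with mode 0644 it can
 * also be changed at run time through sysfs (path assumes the module is
 * loaded under the name "spl"), e.g. to select age-based expiration (0x1):
 *
 *   echo 1 > /sys/module/spl/parameters/spl_kmem_cache_expire
 */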
60
61/*
62 * The default behavior is to report the number of objects remaining in the
63 * cache. This allows the Linux VM to repeatedly reclaim objects from the
 64 * cache when memory is low to satisfy other memory allocations. Alternately,
65 * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
66 * is reclaimed. This may increase the likelihood of out of memory events.
67 */
68unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
69module_param(spl_kmem_cache_reclaim, uint, 0644);
70MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
71
72unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
73module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
74MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
75
76unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
77module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
78MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
79 "Minimal number of objects per slab");
80
81unsigned int spl_kmem_cache_max_size = 32;
82module_param(spl_kmem_cache_max_size, uint, 0644);
83MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
84
85/*
86 * For small objects the Linux slab allocator should be used to make the most
87 * efficient use of the memory. However, large objects are not supported by
88 * the Linux slab and therefore the SPL implementation is preferred. A cutoff
89 * of 16K was determined to be optimal for architectures using 4K pages.
90 */
91#if PAGE_SIZE == 4096
92unsigned int spl_kmem_cache_slab_limit = 16384;
93#else
94unsigned int spl_kmem_cache_slab_limit = 0;
95#endif
96module_param(spl_kmem_cache_slab_limit, uint, 0644);
97MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
98 "Objects less than N bytes use the Linux slab");
99
100unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
101module_param(spl_kmem_cache_kmem_limit, uint, 0644);
102MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
103 "Objects less than N bytes use the kmalloc");
104
105/*
106 * Slab allocation interfaces
107 *
108 * While the Linux slab implementation was inspired by the Solaris
109 * implementation I cannot use it to emulate the Solaris APIs. I
110 * require two features which are not provided by the Linux slab.
111 *
112 * 1) Constructors AND destructors. Recent versions of the Linux
113 * kernel have removed support for destructors. This is a deal
114 * breaker for the SPL which contains particularly expensive
 115 * initializers for mutexes, condition variables, etc. We also
 116 * require a minimal level of cleanup for these data types unlike
 117 * many Linux data types which do not need to be explicitly destroyed.
118 *
119 * 2) Virtual address space backed slab. Callers of the Solaris slab
 120 * expect it to work well for both small and very large allocations.
121 * Because of memory fragmentation the Linux slab which is backed
122 * by kmalloc'ed memory performs very badly when confronted with
123 * large numbers of large allocations. Basing the slab on the
124 * virtual address space removes the need for contiguous pages
 125 * and greatly improves performance for large allocations.
126 *
127 * For these reasons, the SPL has its own slab implementation with
128 * the needed features. It is not as highly optimized as either the
129 * Solaris or Linux slabs, but it should get me most of what is
130 * needed until it can be optimized or obsoleted by another approach.
131 *
132 * One serious concern I do have about this method is the relatively
133 * small virtual address space on 32bit arches. This will seriously
134 * constrain the size of the slab caches and their performance.
135 *
136 * XXX: Improve the partial slab list by carefully maintaining a
137 * strict ordering of fullest to emptiest slabs based on
 138 * the slab reference count. This guarantees that when freeing
139 * slabs back to the system we need only linearly traverse the
140 * last N slabs in the list to discover all the freeable slabs.
141 *
142 * XXX: NUMA awareness for optionally allocating memory close to a
143 * particular core. This can be advantageous if you know the slab
144 * object will be short lived and primarily accessed from one core.
145 *
146 * XXX: Slab coloring may also yield performance improvements and would
147 * be desirable to implement.
148 */
149
150struct list_head spl_kmem_cache_list; /* List of caches */
151struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
152taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
153
154static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
155
156SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
157SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
158 spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
159
160static void *
161kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
162{
163 void *ptr;
164
165 ASSERT(ISP2(size));
166
167 if (skc->skc_flags & KMC_KMEM)
168 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
169 get_order(size));
170 else
171 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
172
173 /* Resulting allocated memory will be page aligned */
174 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
175
176 return ptr;
177}
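
/*
 * Worked example: for a 128 KiB slab with 4 KiB pages, the KMC_KMEM case
 * above asks __get_free_pages() for get_order(131072) == 5, i.e. 32
 * physically contiguous pages, while the vmem case maps the same 32 pages
 * through __vmalloc() without requiring physical contiguity.
 */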
178
179static void
180kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
181{
182 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
183 ASSERT(ISP2(size));
184
185 /*
186 * The Linux direct reclaim path uses this out of band value to
187 * determine if forward progress is being made. Normally this is
188 * incremented by kmem_freepages() which is part of the various
189 * Linux slab implementations. However, since we are using none
190 * of that infrastructure we are responsible for incrementing it.
191 */
192 if (current->reclaim_state)
193 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
194
195 if (skc->skc_flags & KMC_KMEM)
196 free_pages((unsigned long)ptr, get_order(size));
197 else
198 vfree(ptr);
199}
200
201/*
202 * Required space for each aligned sks.
203 */
204static inline uint32_t
205spl_sks_size(spl_kmem_cache_t *skc)
206{
207 return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
208 skc->skc_obj_align, uint32_t);
209}
210
211/*
212 * Required space for each aligned object.
213 */
214static inline uint32_t
215spl_obj_size(spl_kmem_cache_t *skc)
216{
217 uint32_t align = skc->skc_obj_align;
218
219 return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
220 P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
221}
222
223/*
 224 * Lookup the spl_kmem_obj_t for a given object.
225 */
226static inline spl_kmem_obj_t *
227spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
228{
229 return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
230 skc->skc_obj_align, uint32_t);
231}
232
233/*
 234 * Required space for each offslab object taking into account alignment
235 * restrictions and the power-of-two requirement of kv_alloc().
236 */
237static inline uint32_t
238spl_offslab_size(spl_kmem_cache_t *skc)
239{
240 return 1UL << (fls64(spl_obj_size(skc)) + 1);
241}
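
/*
 * Worked example (illustrative sizes): with skc_obj_size = 100 and
 * skc_obj_align = 8, the object is padded to 104 bytes, so
 * spl_sko_from_obj() finds the spl_kmem_obj_t at obj + 104 and
 * spl_obj_size() adds the aligned size of spl_kmem_obj_t on top of that.
 * If spl_obj_size() came to 3000 bytes, fls64(3000) == 12 and the offslab
 * allocation size would be 1UL << 13 = 8192 bytes, a power of two (as
 * kv_alloc() requires) that is at least twice the object size.
 */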
242
243/*
244 * It's important that we pack the spl_kmem_obj_t structure and the
 245 * actual objects into one large address space to minimize the number
 246 * of calls to the allocator. It is far better to do a few large
 247 * allocations and then subdivide them ourselves. Now which allocator
 248 * we use requires balancing a few trade-offs.
249 *
250 * For small objects we use kmem_alloc() because as long as you are
 251 * only requesting a small number of pages (ideally just one) it's cheap.
252 * However, when you start requesting multiple pages with kmem_alloc()
253 * it gets increasingly expensive since it requires contiguous pages.
254 * For this reason we shift to vmem_alloc() for slabs of large objects
255 * which removes the need for contiguous pages. We do not use
256 * vmem_alloc() in all cases because there is significant locking
257 * overhead in __get_vm_area_node(). This function takes a single
258 * global lock when acquiring an available virtual address range which
259 * serializes all vmem_alloc()'s for all slab caches. Using slightly
260 * different allocation functions for small and large objects should
261 * give us the best of both worlds.
262 *
263 * KMC_ONSLAB KMC_OFFSLAB
264 *
265 * +------------------------+ +-----------------+
266 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
267 * | skc_obj_size <-+ | | +-----------------+ | |
268 * | spl_kmem_obj_t | | | |
269 * | skc_obj_size <---+ | +-----------------+ | |
270 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
271 * | ... v | | spl_kmem_obj_t | |
272 * +------------------------+ +-----------------+ v
273 */
274static spl_kmem_slab_t *
275spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
276{
277 spl_kmem_slab_t *sks;
278 spl_kmem_obj_t *sko, *n;
279 void *base, *obj;
280 uint32_t obj_size, offslab_size = 0;
281 int i, rc = 0;
282
283 base = kv_alloc(skc, skc->skc_slab_size, flags);
284 if (base == NULL)
285 return (NULL);
286
287 sks = (spl_kmem_slab_t *)base;
288 sks->sks_magic = SKS_MAGIC;
289 sks->sks_objs = skc->skc_slab_objs;
290 sks->sks_age = jiffies;
291 sks->sks_cache = skc;
292 INIT_LIST_HEAD(&sks->sks_list);
293 INIT_LIST_HEAD(&sks->sks_free_list);
294 sks->sks_ref = 0;
295 obj_size = spl_obj_size(skc);
296
297 if (skc->skc_flags & KMC_OFFSLAB)
298 offslab_size = spl_offslab_size(skc);
299
300 for (i = 0; i < sks->sks_objs; i++) {
301 if (skc->skc_flags & KMC_OFFSLAB) {
302 obj = kv_alloc(skc, offslab_size, flags);
303 if (!obj) {
304 rc = -ENOMEM;
305 goto out;
306 }
307 } else {
308 obj = base + spl_sks_size(skc) + (i * obj_size);
309 }
310
311 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
312 sko = spl_sko_from_obj(skc, obj);
313 sko->sko_addr = obj;
314 sko->sko_magic = SKO_MAGIC;
315 sko->sko_slab = sks;
316 INIT_LIST_HEAD(&sko->sko_list);
317 list_add_tail(&sko->sko_list, &sks->sks_free_list);
318 }
319
320out:
321 if (rc) {
322 if (skc->skc_flags & KMC_OFFSLAB)
323 list_for_each_entry_safe(sko, n, &sks->sks_free_list,
324 sko_list)
325 kv_free(skc, sko->sko_addr, offslab_size);
326
327 kv_free(skc, base, skc->skc_slab_size);
328 sks = NULL;
329 }
330
331 return (sks);
332}
333
334/*
 335 * Remove a slab from the complete or partial list. This must be called
 336 * with the 'skc->skc_lock' held but the actual free must be performed
337 * outside the lock to prevent deadlocking on vmem addresses.
338 */
339static void
340spl_slab_free(spl_kmem_slab_t *sks,
341 struct list_head *sks_list, struct list_head *sko_list)
342{
343 spl_kmem_cache_t *skc;
344
345 ASSERT(sks->sks_magic == SKS_MAGIC);
346 ASSERT(sks->sks_ref == 0);
347
348 skc = sks->sks_cache;
349 ASSERT(skc->skc_magic == SKC_MAGIC);
350 ASSERT(spin_is_locked(&skc->skc_lock));
351
352 /*
353 * Update slab/objects counters in the cache, then remove the
354 * slab from the skc->skc_partial_list. Finally add the slab
 355 * and all its objects into the private work lists where the
356 * destructors will be called and the memory freed to the system.
357 */
358 skc->skc_obj_total -= sks->sks_objs;
359 skc->skc_slab_total--;
360 list_del(&sks->sks_list);
361 list_add(&sks->sks_list, sks_list);
362 list_splice_init(&sks->sks_free_list, sko_list);
363}
364
365/*
 366 * Traverses all the partial slabs attached to a cache and frees those
 367 * which are currently empty, and have not been touched for
368 * skc_delay seconds to avoid thrashing. The count argument is
369 * passed to optionally cap the number of slabs reclaimed, a count
370 * of zero means try and reclaim everything. When flag is set we
371 * always free an available slab regardless of age.
372 */
373static void
374spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
375{
376 spl_kmem_slab_t *sks, *m;
377 spl_kmem_obj_t *sko, *n;
378 LIST_HEAD(sks_list);
379 LIST_HEAD(sko_list);
380 uint32_t size = 0;
381 int i = 0;
382
383 /*
384 * Move empty slabs and objects which have not been touched in
385 * skc_delay seconds on to private lists to be freed outside
 386 * the spin lock. This delay time is important to avoid thrashing;
 387 * however, when flag is set the delay will not be used.
388 */
389 spin_lock(&skc->skc_lock);
390 list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
391 /*
392 * All empty slabs are at the end of skc->skc_partial_list,
393 * therefore once a non-empty slab is found we can stop
394 * scanning. Additionally, stop when reaching the target
395 * reclaim 'count' if a non-zero threshold is given.
396 */
397 if ((sks->sks_ref > 0) || (count && i >= count))
398 break;
399
400 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
401 spl_slab_free(sks, &sks_list, &sko_list);
402 i++;
403 }
404 }
405 spin_unlock(&skc->skc_lock);
406
407 /*
408 * The following two loops ensure all the object destructors are
409 * run, any offslab objects are freed, and the slabs themselves
410 * are freed. This is all done outside the skc->skc_lock since
411 * this allows the destructor to sleep, and allows us to perform
 412 * a conditional reschedule when freeing a large number of
413 * objects and slabs back to the system.
414 */
415 if (skc->skc_flags & KMC_OFFSLAB)
416 size = spl_offslab_size(skc);
417
418 list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
419 ASSERT(sko->sko_magic == SKO_MAGIC);
420
421 if (skc->skc_flags & KMC_OFFSLAB)
422 kv_free(skc, sko->sko_addr, size);
423 }
424
425 list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
426 ASSERT(sks->sks_magic == SKS_MAGIC);
427 kv_free(skc, sks, skc->skc_slab_size);
428 }
429}
430
431static spl_kmem_emergency_t *
432spl_emergency_search(struct rb_root *root, void *obj)
433{
434 struct rb_node *node = root->rb_node;
435 spl_kmem_emergency_t *ske;
436 unsigned long address = (unsigned long)obj;
437
438 while (node) {
439 ske = container_of(node, spl_kmem_emergency_t, ske_node);
440
441 if (address < (unsigned long)ske->ske_obj)
442 node = node->rb_left;
443 else if (address > (unsigned long)ske->ske_obj)
444 node = node->rb_right;
445 else
446 return ske;
447 }
448
449 return NULL;
450}
451
452static int
453spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
454{
455 struct rb_node **new = &(root->rb_node), *parent = NULL;
456 spl_kmem_emergency_t *ske_tmp;
457 unsigned long address = (unsigned long)ske->ske_obj;
458
459 while (*new) {
460 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
461
462 parent = *new;
463 if (address < (unsigned long)ske_tmp->ske_obj)
464 new = &((*new)->rb_left);
465 else if (address > (unsigned long)ske_tmp->ske_obj)
466 new = &((*new)->rb_right);
467 else
468 return 0;
469 }
470
471 rb_link_node(&ske->ske_node, parent, new);
472 rb_insert_color(&ske->ske_node, root);
473
474 return 1;
475}
476
477/*
478 * Allocate a single emergency object and track it in a red black tree.
479 */
480static int
481spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
482{
483 spl_kmem_emergency_t *ske;
484 int empty;
485
 486 /* Last chance, use a partial slab if one now exists */
487 spin_lock(&skc->skc_lock);
488 empty = list_empty(&skc->skc_partial_list);
489 spin_unlock(&skc->skc_lock);
490 if (!empty)
491 return (-EEXIST);
492
493 ske = kmalloc(sizeof(*ske), flags);
494 if (ske == NULL)
495 return (-ENOMEM);
496
497 ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
498 if (ske->ske_obj == NULL) {
499 kfree(ske);
500 return (-ENOMEM);
501 }
502
503 spin_lock(&skc->skc_lock);
504 empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
505 if (likely(empty)) {
506 skc->skc_obj_total++;
507 skc->skc_obj_emergency++;
508 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
509 skc->skc_obj_emergency_max = skc->skc_obj_emergency;
510 }
511 spin_unlock(&skc->skc_lock);
512
513 if (unlikely(!empty)) {
514 kfree(ske->ske_obj);
515 kfree(ske);
516 return (-EINVAL);
517 }
518
519 *obj = ske->ske_obj;
520
521 return (0);
522}
523
524/*
525 * Locate the passed object in the red black tree and free it.
526 */
527static int
528spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
529{
530 spl_kmem_emergency_t *ske;
531
532 spin_lock(&skc->skc_lock);
533 ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
534 if (likely(ske)) {
535 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
536 skc->skc_obj_emergency--;
537 skc->skc_obj_total--;
538 }
539 spin_unlock(&skc->skc_lock);
540
541 if (unlikely(ske == NULL))
542 return (-ENOENT);
543
544 kfree(ske->ske_obj);
545 kfree(ske);
546
547 return (0);
548}
549
550/*
551 * Release objects from the per-cpu magazine back to their slab. The flush
552 * argument contains the max number of entries to remove from the magazine.
553 */
554static void
555__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
556{
557 int i, count = MIN(flush, skm->skm_avail);
558
559 ASSERT(skc->skc_magic == SKC_MAGIC);
560 ASSERT(skm->skm_magic == SKM_MAGIC);
561 ASSERT(spin_is_locked(&skc->skc_lock));
562
563 for (i = 0; i < count; i++)
564 spl_cache_shrink(skc, skm->skm_objs[i]);
565
566 skm->skm_avail -= count;
567 memmove(skm->skm_objs, &(skm->skm_objs[count]),
568 sizeof(void *) * skm->skm_avail);
569}
570
571static void
572spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
573{
574 spin_lock(&skc->skc_lock);
575 __spl_cache_flush(skc, skm, flush);
576 spin_unlock(&skc->skc_lock);
577}
578
579static void
580spl_magazine_age(void *data)
581{
582 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
583 spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
584
585 ASSERT(skm->skm_magic == SKM_MAGIC);
586 ASSERT(skm->skm_cpu == smp_processor_id());
587 ASSERT(irqs_disabled());
588
589 /* There are no available objects or they are too young to age out */
590 if ((skm->skm_avail == 0) ||
591 time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
592 return;
593
594 /*
595 * Because we're executing in interrupt context we may have
596 * interrupted the holder of this lock. To avoid a potential
 597 * deadlock, return if the lock is contended.
598 */
599 if (!spin_trylock(&skc->skc_lock))
600 return;
601
602 __spl_cache_flush(skc, skm, skm->skm_refill);
603 spin_unlock(&skc->skc_lock);
604}
605
606/*
607 * Called regularly to keep a downward pressure on the cache.
608 *
609 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
610 * be returned to the caches. This is done to prevent idle magazines from
611 * holding memory which could be better used elsewhere. The delay is
612 * present to prevent thrashing the magazine.
613 *
614 * The newly released objects may result in empty partial slabs. Those
615 * slabs should be released to the system. Otherwise moving the objects
616 * out of the magazines is just wasted work.
617 */
618static void
619spl_cache_age(void *data)
620{
621 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
622 taskqid_t id = 0;
623
624 ASSERT(skc->skc_magic == SKC_MAGIC);
625
626 /* Dynamically disabled at run time */
627 if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
628 return;
629
630 atomic_inc(&skc->skc_ref);
631
632 if (!(skc->skc_flags & KMC_NOMAGAZINE))
633 on_each_cpu(spl_magazine_age, skc, 1);
634
635 spl_slab_reclaim(skc, skc->skc_reap, 0);
636
637 while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
638 id = taskq_dispatch_delay(
639 spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
640 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
641
 642 /* Destroy issued after dispatch, immediately cancel it */
643 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
644 taskq_cancel_id(spl_kmem_cache_taskq, id);
645 }
646
647 spin_lock(&skc->skc_lock);
648 skc->skc_taskqid = id;
649 spin_unlock(&skc->skc_lock);
650
651 atomic_dec(&skc->skc_ref);
652}
653
654/*
655 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
656 * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
657 * for very small objects we may end up with more than this so as not
658 * to waste space in the minimal allocation of a single page. Also for
 659 * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
 660 * with fewer than this we will fail.
661 */
662static int
663spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
664{
665 uint32_t sks_size, obj_size, max_size;
666
667 if (skc->skc_flags & KMC_OFFSLAB) {
668 *objs = spl_kmem_cache_obj_per_slab;
669 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
670 return (0);
671 } else {
672 sks_size = spl_sks_size(skc);
673 obj_size = spl_obj_size(skc);
674
675 if (skc->skc_flags & KMC_KMEM)
676 max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
677 else
678 max_size = (spl_kmem_cache_max_size * 1024 * 1024);
679
680 /* Power of two sized slab */
681 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
682 *objs = (*size - sks_size) / obj_size;
683 if (*objs >= spl_kmem_cache_obj_per_slab)
684 return (0);
685 }
686
687 /*
 688 * Unable to satisfy the target objects per slab, fall back to
 689 * allocating a maximally sized slab and use it if it can
 690 * contain at least the minimum object count. If not, fail.
691 */
692 *size = max_size;
693 *objs = (*size - sks_size) / obj_size;
694 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
695 return (0);
696 }
697
698 return (-ENOSPC);
699}
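
/*
 * Worked example (illustrative sizes): assuming spl_obj_size() of 1024
 * bytes, an sks header of 128 bytes, and spl_kmem_cache_obj_per_slab = 8,
 * the power-of-two loop above rejects 4096 (3 objects) and 8192 (7
 * objects) and settles on *size = 16384 with *objs = 15.
 */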
700
701/*
702 * Make a guess at reasonable per-cpu magazine size based on the size of
703 * each object and the cost of caching N of them in each magazine. Long
704 * term this should really adapt based on an observed usage heuristic.
705 */
706static int
707spl_magazine_size(spl_kmem_cache_t *skc)
708{
709 uint32_t obj_size = spl_obj_size(skc);
710 int size;
711
 712 /* Per-magazine sizes below assume a 4KiB page size */
 713 if (obj_size > (PAGE_SIZE * 256))
 714 size = 4; /* Minimum 4MiB per-magazine */
 715 else if (obj_size > (PAGE_SIZE * 32))
 716 size = 16; /* Minimum 2MiB per-magazine */
 717 else if (obj_size > (PAGE_SIZE))
 718 size = 64; /* Minimum 256KiB per-magazine */
 719 else if (obj_size > (PAGE_SIZE / 4))
 720 size = 128; /* Minimum 128KiB per-magazine */
721 else
722 size = 256;
723
724 return (size);
725}
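
/*
 * Worked example: with 4 KiB pages an 8 KiB object falls into the
 * "obj_size > PAGE_SIZE" bucket above, so each per-cpu magazine holds up
 * to 64 objects, or roughly 512 KiB of cached memory per cpu.
 */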
726
727/*
728 * Allocate a per-cpu magazine to associate with a specific core.
729 */
730static spl_kmem_magazine_t *
731spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
732{
733 spl_kmem_magazine_t *skm;
734 int size = sizeof(spl_kmem_magazine_t) +
735 sizeof(void *) * skc->skc_mag_size;
736
737 skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
738 if (skm) {
739 skm->skm_magic = SKM_MAGIC;
740 skm->skm_avail = 0;
741 skm->skm_size = skc->skc_mag_size;
742 skm->skm_refill = skc->skc_mag_refill;
743 skm->skm_cache = skc;
744 skm->skm_age = jiffies;
745 skm->skm_cpu = cpu;
746 }
747
748 return (skm);
749}
750
751/*
752 * Free a per-cpu magazine associated with a specific core.
753 */
754static void
755spl_magazine_free(spl_kmem_magazine_t *skm)
756{
757 int size = sizeof(spl_kmem_magazine_t) +
758 sizeof(void *) * skm->skm_size;
759
760 ASSERT(skm->skm_magic == SKM_MAGIC);
761 ASSERT(skm->skm_avail == 0);
762
763 kmem_free(skm, size);
764}
765
766/*
 767 * Create all per-cpu magazines of reasonable sizes.
768 */
769static int
770spl_magazine_create(spl_kmem_cache_t *skc)
771{
772 int i;
773
774 if (skc->skc_flags & KMC_NOMAGAZINE)
775 return (0);
776
777 skc->skc_mag_size = spl_magazine_size(skc);
778 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
779
780 for_each_online_cpu(i) {
781 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
782 if (!skc->skc_mag[i]) {
783 for (i--; i >= 0; i--)
784 spl_magazine_free(skc->skc_mag[i]);
785
786 return (-ENOMEM);
787 }
788 }
789
790 return (0);
791}
792
793/*
 794 * Destroy all per-cpu magazines.
795 */
796static void
797spl_magazine_destroy(spl_kmem_cache_t *skc)
798{
799 spl_kmem_magazine_t *skm;
800 int i;
801
802 if (skc->skc_flags & KMC_NOMAGAZINE)
803 return;
804
805 for_each_online_cpu(i) {
806 skm = skc->skc_mag[i];
807 spl_cache_flush(skc, skm, skm->skm_avail);
808 spl_magazine_free(skm);
809 }
810}
811
812/*
 813 * Create an object cache based on the following arguments:
814 * name cache name
815 * size cache object size
816 * align cache object alignment
817 * ctor cache object constructor
818 * dtor cache object destructor
819 * reclaim cache object reclaim
820 * priv cache private data for ctor/dtor/reclaim
821 * vmp unused must be NULL
822 * flags
823 * KMC_NOTOUCH Disable cache object aging (unsupported)
824 * KMC_NODEBUG Disable debugging (unsupported)
825 * KMC_NOHASH Disable hashing (unsupported)
826 * KMC_QCACHE Disable qcache (unsupported)
827 * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
828 * KMC_KMEM Force kmem backed cache
829 * KMC_VMEM Force vmem backed cache
830 * KMC_SLAB Force Linux slab backed cache
831 * KMC_OFFSLAB Locate objects off the slab
832 */
833spl_kmem_cache_t *
834spl_kmem_cache_create(char *name, size_t size, size_t align,
835 spl_kmem_ctor_t ctor,
836 spl_kmem_dtor_t dtor,
837 spl_kmem_reclaim_t reclaim,
838 void *priv, void *vmp, int flags)
839{
840 spl_kmem_cache_t *skc;
841 int rc;
842
843 /*
844 * Unsupported flags
845 */
846 ASSERT0(flags & KMC_NOMAGAZINE);
847 ASSERT0(flags & KMC_NOHASH);
848 ASSERT0(flags & KMC_QCACHE);
849 ASSERT(vmp == NULL);
850
851 might_sleep();
852
853 /*
 854 * Allocate memory for a new cache and initialize it. Unfortunately,
 855 * this usually ends up being a large allocation of ~32k because
 856 * we need to allocate enough memory for the worst case number of
 857 * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
 858 * explicitly pass KM_NODEBUG to suppress the kmem warning.
859 */
 860 skc = kmem_zalloc(sizeof(*skc), KM_SLEEP | KM_NODEBUG);
861 if (skc == NULL)
862 return (NULL);
863
864 skc->skc_magic = SKC_MAGIC;
865 skc->skc_name_size = strlen(name) + 1;
866 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
867 if (skc->skc_name == NULL) {
868 kmem_free(skc, sizeof(*skc));
869 return (NULL);
870 }
871 strncpy(skc->skc_name, name, skc->skc_name_size);
872
873 skc->skc_ctor = ctor;
874 skc->skc_dtor = dtor;
875 skc->skc_reclaim = reclaim;
876 skc->skc_private = priv;
877 skc->skc_vmp = vmp;
878 skc->skc_linux_cache = NULL;
879 skc->skc_flags = flags;
880 skc->skc_obj_size = size;
881 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
882 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
883 skc->skc_reap = SPL_KMEM_CACHE_REAP;
884 atomic_set(&skc->skc_ref, 0);
885
886 INIT_LIST_HEAD(&skc->skc_list);
887 INIT_LIST_HEAD(&skc->skc_complete_list);
888 INIT_LIST_HEAD(&skc->skc_partial_list);
889 skc->skc_emergency_tree = RB_ROOT;
890 spin_lock_init(&skc->skc_lock);
891 init_waitqueue_head(&skc->skc_waitq);
892 skc->skc_slab_fail = 0;
893 skc->skc_slab_create = 0;
894 skc->skc_slab_destroy = 0;
895 skc->skc_slab_total = 0;
896 skc->skc_slab_alloc = 0;
897 skc->skc_slab_max = 0;
898 skc->skc_obj_total = 0;
899 skc->skc_obj_alloc = 0;
900 skc->skc_obj_max = 0;
901 skc->skc_obj_deadlock = 0;
902 skc->skc_obj_emergency = 0;
903 skc->skc_obj_emergency_max = 0;
904
905 /*
906 * Verify the requested alignment restriction is sane.
907 */
908 if (align) {
909 VERIFY(ISP2(align));
910 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
911 VERIFY3U(align, <=, PAGE_SIZE);
912 skc->skc_obj_align = align;
913 }
914
915 /*
916 * When no specific type of slab is requested (kmem, vmem, or
917 * linuxslab) then select a cache type based on the object size
918 * and default tunables.
919 */
920 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
921
922 /*
923 * Objects smaller than spl_kmem_cache_slab_limit can
924 * use the Linux slab for better space-efficiency. By
925 * default this functionality is disabled until its
 926 * performance characteristics are fully understood.
927 */
928 if (spl_kmem_cache_slab_limit &&
929 size <= (size_t)spl_kmem_cache_slab_limit)
930 skc->skc_flags |= KMC_SLAB;
931
932 /*
933 * Small objects, less than spl_kmem_cache_kmem_limit per
 934 * object, should use kmem because their slabs are small.
935 */
936 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
937 skc->skc_flags |= KMC_KMEM;
938
939 /*
940 * All other objects are considered large and are placed
941 * on vmem backed slabs.
942 */
943 else
944 skc->skc_flags |= KMC_VMEM;
945 }
946
947 /*
948 * Given the type of slab allocate the required resources.
949 */
950 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
951 rc = spl_slab_size(skc,
952 &skc->skc_slab_objs, &skc->skc_slab_size);
953 if (rc)
954 goto out;
955
956 rc = spl_magazine_create(skc);
957 if (rc)
958 goto out;
959 } else {
960 skc->skc_linux_cache = kmem_cache_create(
961 skc->skc_name, size, align, 0, NULL);
962 if (skc->skc_linux_cache == NULL) {
963 rc = ENOMEM;
964 goto out;
965 }
966
967 kmem_cache_set_allocflags(skc, __GFP_COMP);
968 skc->skc_flags |= KMC_NOMAGAZINE;
969 }
970
971 if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
972 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
973 spl_cache_age, skc, TQ_SLEEP,
974 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
975
976 down_write(&spl_kmem_cache_sem);
977 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
978 up_write(&spl_kmem_cache_sem);
979
980 return (skc);
981out:
982 kmem_free(skc->skc_name, skc->skc_name_size);
983 kmem_free(skc, sizeof(*skc));
984 return (NULL);
985}
986EXPORT_SYMBOL(spl_kmem_cache_create);
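
/*
 * Illustrative usage sketch, kept out of the build with #if 0. The names
 * my_obj_t, my_ctor, my_dtor, and my_cache_example are hypothetical; the
 * constructor and destructor prototypes follow how skc_ctor and skc_dtor
 * are invoked by spl_kmem_cache_alloc() and spl_kmem_cache_free() below.
 */
#if 0
typedef struct my_obj {
	int mo_state;
} my_obj_t;

static void
my_ctor(void *obj, void *priv, int flags)
{
	/* Runs when spl_kmem_cache_alloc() hands out an object */
	((my_obj_t *)obj)->mo_state = 0;
}

static void
my_dtor(void *obj, void *priv)
{
	/* Runs when the object is returned via spl_kmem_cache_free() */
}

static void
my_cache_example(void)
{
	spl_kmem_cache_t *skc;
	my_obj_t *mo;

	/* Flags of 0 let the object size pick kmem, vmem, or slab backing */
	skc = spl_kmem_cache_create("my_obj_cache", sizeof(my_obj_t), 0,
	    my_ctor, my_dtor, NULL, NULL, NULL, 0);
	if (skc == NULL)
		return;

	mo = spl_kmem_cache_alloc(skc, KM_SLEEP);
	spl_kmem_cache_free(skc, mo);
	spl_kmem_cache_destroy(skc);
}
#endif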
987
988/*
 989 * Register a move callback for cache defragmentation.
990 * XXX: Unimplemented but harmless to stub out for now.
991 */
992void
993spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
994 kmem_cbrc_t (move)(void *, void *, size_t, void *))
995{
996 ASSERT(move != NULL);
997}
998EXPORT_SYMBOL(spl_kmem_cache_set_move);
999
1000/*
1001 * Destroy a cache and all objects associated with the cache.
1002 */
1003void
1004spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1005{
1006 DECLARE_WAIT_QUEUE_HEAD(wq);
1007 taskqid_t id;
1008
1009 ASSERT(skc->skc_magic == SKC_MAGIC);
1010 ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1011
1012 down_write(&spl_kmem_cache_sem);
1013 list_del_init(&skc->skc_list);
1014 up_write(&spl_kmem_cache_sem);
1015
 1016 /* Cancel and wait for any pending delayed tasks */
1017 VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1018
1019 spin_lock(&skc->skc_lock);
1020 id = skc->skc_taskqid;
1021 spin_unlock(&skc->skc_lock);
1022
1023 taskq_cancel_id(spl_kmem_cache_taskq, id);
1024
 1025 /* Wait until all current callers complete; this is mainly
1026 * to catch the case where a low memory situation triggers a
1027 * cache reaping action which races with this destroy. */
1028 wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1029
1030 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1031 spl_magazine_destroy(skc);
1032 spl_slab_reclaim(skc, 0, 1);
1033 } else {
1034 ASSERT(skc->skc_flags & KMC_SLAB);
1035 kmem_cache_destroy(skc->skc_linux_cache);
1036 }
1037
1038 spin_lock(&skc->skc_lock);
1039
1040 /* Validate there are no objects in use and free all the
1041 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1042 ASSERT3U(skc->skc_slab_alloc, ==, 0);
1043 ASSERT3U(skc->skc_obj_alloc, ==, 0);
1044 ASSERT3U(skc->skc_slab_total, ==, 0);
1045 ASSERT3U(skc->skc_obj_total, ==, 0);
1046 ASSERT3U(skc->skc_obj_emergency, ==, 0);
1047 ASSERT(list_empty(&skc->skc_complete_list));
1048
1049 kmem_free(skc->skc_name, skc->skc_name_size);
1050 spin_unlock(&skc->skc_lock);
1051
1052 kmem_free(skc, sizeof(*skc));
1053}
1054EXPORT_SYMBOL(spl_kmem_cache_destroy);
1055
1056/*
1057 * Allocate an object from a slab attached to the cache. This is used to
1058 * repopulate the per-cpu magazine caches in batches when they run low.
1059 */
1060static void *
1061spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1062{
1063 spl_kmem_obj_t *sko;
1064
1065 ASSERT(skc->skc_magic == SKC_MAGIC);
1066 ASSERT(sks->sks_magic == SKS_MAGIC);
1067 ASSERT(spin_is_locked(&skc->skc_lock));
1068
1069 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1070 ASSERT(sko->sko_magic == SKO_MAGIC);
1071 ASSERT(sko->sko_addr != NULL);
1072
1073 /* Remove from sks_free_list */
1074 list_del_init(&sko->sko_list);
1075
1076 sks->sks_age = jiffies;
1077 sks->sks_ref++;
1078 skc->skc_obj_alloc++;
1079
1080 /* Track max obj usage statistics */
1081 if (skc->skc_obj_alloc > skc->skc_obj_max)
1082 skc->skc_obj_max = skc->skc_obj_alloc;
1083
1084 /* Track max slab usage statistics */
1085 if (sks->sks_ref == 1) {
1086 skc->skc_slab_alloc++;
1087
1088 if (skc->skc_slab_alloc > skc->skc_slab_max)
1089 skc->skc_slab_max = skc->skc_slab_alloc;
1090 }
1091
1092 return sko->sko_addr;
1093}
1094
1095/*
 1096 * Generic slab allocation function run by the global work queues.
1097 * It is responsible for allocating a new slab, linking it in to the list
1098 * of partial slabs, and then waking any waiters.
1099 */
1100static void
1101spl_cache_grow_work(void *data)
1102{
1103 spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1104 spl_kmem_cache_t *skc = ska->ska_cache;
1105 spl_kmem_slab_t *sks;
1106
1107 sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1108 spin_lock(&skc->skc_lock);
1109 if (sks) {
1110 skc->skc_slab_total++;
1111 skc->skc_obj_total += sks->sks_objs;
1112 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1113 }
1114
1115 atomic_dec(&skc->skc_ref);
1116 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1117 clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1118 wake_up_all(&skc->skc_waitq);
1119 spin_unlock(&skc->skc_lock);
1120
1121 kfree(ska);
1122}
1123
1124/*
1125 * Returns non-zero when a new slab should be available.
1126 */
1127static int
1128spl_cache_grow_wait(spl_kmem_cache_t *skc)
1129{
1130 return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1131}
1132
1133/*
1134 * No available objects on any slabs, create a new slab. Note that this
1135 * functionality is disabled for KMC_SLAB caches which are backed by the
1136 * Linux slab.
1137 */
1138static int
1139spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1140{
1141 int remaining, rc;
1142
1143 ASSERT(skc->skc_magic == SKC_MAGIC);
1144 ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1145 might_sleep();
1146 *obj = NULL;
1147
1148 /*
1149 * Before allocating a new slab wait for any reaping to complete and
1150 * then return so the local magazine can be rechecked for new objects.
1151 */
1152 if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1153 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1154 TASK_UNINTERRUPTIBLE);
1155 return (rc ? rc : -EAGAIN);
1156 }
1157
1158 /*
1159 * This is handled by dispatching a work request to the global work
1160 * queue. This allows us to asynchronously allocate a new slab while
 1161 * retaining the ability to safely fall back to smaller synchronous
1162 * allocations to ensure forward progress is always maintained.
1163 */
1164 if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1165 spl_kmem_alloc_t *ska;
1166
1167 ska = kmalloc(sizeof(*ska), flags);
1168 if (ska == NULL) {
1169 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1170 wake_up_all(&skc->skc_waitq);
1171 return (-ENOMEM);
1172 }
1173
1174 atomic_inc(&skc->skc_ref);
1175 ska->ska_cache = skc;
1176 ska->ska_flags = flags & ~__GFP_FS;
1177 taskq_init_ent(&ska->ska_tqe);
1178 taskq_dispatch_ent(spl_kmem_cache_taskq,
1179 spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1180 }
1181
1182 /*
1183 * The goal here is to only detect the rare case where a virtual slab
1184 * allocation has deadlocked. We must be careful to minimize the use
1185 * of emergency objects which are more expensive to track. Therefore,
1186 * we set a very long timeout for the asynchronous allocation and if
1187 * the timeout is reached the cache is flagged as deadlocked. From
1188 * this point only new emergency objects will be allocated until the
1189 * asynchronous allocation completes and clears the deadlocked flag.
1190 */
1191 if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1192 rc = spl_emergency_alloc(skc, flags, obj);
1193 } else {
1194 remaining = wait_event_timeout(skc->skc_waitq,
1195 spl_cache_grow_wait(skc), HZ);
1196
1197 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1198 spin_lock(&skc->skc_lock);
1199 if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1200 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1201 skc->skc_obj_deadlock++;
1202 }
1203 spin_unlock(&skc->skc_lock);
1204 }
1205
1206 rc = -ENOMEM;
1207 }
1208
1209 return (rc);
1210}
1211
1212/*
1213 * Refill a per-cpu magazine with objects from the slabs for this cache.
1214 * Ideally the magazine can be repopulated using existing objects which have
1215 * been released, however if we are unable to locate enough free objects new
1216 * slabs of objects will be created. On success NULL is returned, otherwise
1217 * the address of a single emergency object is returned for use by the caller.
1218 */
1219static void *
1220spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1221{
1222 spl_kmem_slab_t *sks;
1223 int count = 0, rc, refill;
1224 void *obj = NULL;
1225
1226 ASSERT(skc->skc_magic == SKC_MAGIC);
1227 ASSERT(skm->skm_magic == SKM_MAGIC);
1228
1229 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1230 spin_lock(&skc->skc_lock);
1231
1232 while (refill > 0) {
 1233 /* No slabs available, we may need to grow the cache */
1234 if (list_empty(&skc->skc_partial_list)) {
1235 spin_unlock(&skc->skc_lock);
1236
1237 local_irq_enable();
1238 rc = spl_cache_grow(skc, flags, &obj);
1239 local_irq_disable();
1240
1241 /* Emergency object for immediate use by caller */
1242 if (rc == 0 && obj != NULL)
1243 return (obj);
1244
1245 if (rc)
1246 goto out;
1247
 1248 /* Rescheduled to a different CPU, skm is not local */
1249 if (skm != skc->skc_mag[smp_processor_id()])
1250 goto out;
1251
1252 /* Potentially rescheduled to the same CPU but
1253 * allocations may have occurred from this CPU while
1254 * we were sleeping so recalculate max refill. */
1255 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1256
1257 spin_lock(&skc->skc_lock);
1258 continue;
1259 }
1260
1261 /* Grab the next available slab */
1262 sks = list_entry((&skc->skc_partial_list)->next,
1263 spl_kmem_slab_t, sks_list);
1264 ASSERT(sks->sks_magic == SKS_MAGIC);
1265 ASSERT(sks->sks_ref < sks->sks_objs);
1266 ASSERT(!list_empty(&sks->sks_free_list));
1267
1268 /* Consume as many objects as needed to refill the requested
1269 * cache. We must also be careful not to overfill it. */
1270 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
1271 ASSERT(skm->skm_avail < skm->skm_size);
1272 ASSERT(count < skm->skm_size);
1273 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1274 }
1275
1276 /* Move slab to skc_complete_list when full */
1277 if (sks->sks_ref == sks->sks_objs) {
1278 list_del(&sks->sks_list);
1279 list_add(&sks->sks_list, &skc->skc_complete_list);
1280 }
1281 }
1282
1283 spin_unlock(&skc->skc_lock);
1284out:
1285 return (NULL);
1286}
1287
1288/*
1289 * Release an object back to the slab from which it came.
1290 */
1291static void
1292spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1293{
1294 spl_kmem_slab_t *sks = NULL;
1295 spl_kmem_obj_t *sko = NULL;
1296
1297 ASSERT(skc->skc_magic == SKC_MAGIC);
1298 ASSERT(spin_is_locked(&skc->skc_lock));
1299
1300 sko = spl_sko_from_obj(skc, obj);
1301 ASSERT(sko->sko_magic == SKO_MAGIC);
1302 sks = sko->sko_slab;
1303 ASSERT(sks->sks_magic == SKS_MAGIC);
1304 ASSERT(sks->sks_cache == skc);
1305 list_add(&sko->sko_list, &sks->sks_free_list);
1306
1307 sks->sks_age = jiffies;
1308 sks->sks_ref--;
1309 skc->skc_obj_alloc--;
1310
1311 /* Move slab to skc_partial_list when no longer full. Slabs
 1312 * are added to the head to keep the partial list in quasi-full
1313 * sorted order. Fuller at the head, emptier at the tail. */
1314 if (sks->sks_ref == (sks->sks_objs - 1)) {
1315 list_del(&sks->sks_list);
1316 list_add(&sks->sks_list, &skc->skc_partial_list);
1317 }
1318
1319 /* Move empty slabs to the end of the partial list so
1320 * they can be easily found and freed during reclamation. */
1321 if (sks->sks_ref == 0) {
1322 list_del(&sks->sks_list);
1323 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1324 skc->skc_slab_alloc--;
1325 }
1326}
1327
1328/*
1329 * Allocate an object from the per-cpu magazine, or if the magazine
1330 * is empty directly allocate from a slab and repopulate the magazine.
1331 */
1332void *
1333spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1334{
1335 spl_kmem_magazine_t *skm;
1336 void *obj = NULL;
1337
1338 ASSERT(skc->skc_magic == SKC_MAGIC);
1339 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1340 ASSERT(flags & KM_SLEEP);
1341
1342 atomic_inc(&skc->skc_ref);
1343
1344 /*
1345 * Allocate directly from a Linux slab. All optimizations are left
 1346 * to the underlying cache; we only need to guarantee that KM_SLEEP
1347 * callers will never fail.
1348 */
1349 if (skc->skc_flags & KMC_SLAB) {
1350 struct kmem_cache *slc = skc->skc_linux_cache;
1351
1352 do {
1353 obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
1354 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1355
1356 goto ret;
1357 }
1358
1359 local_irq_disable();
1360
1361restart:
1362 /* Safe to update per-cpu structure without lock, but
1363 * in the restart case we must be careful to reacquire
1364 * the local magazine since this may have changed
1365 * when we need to grow the cache. */
1366 skm = skc->skc_mag[smp_processor_id()];
1367 ASSERT(skm->skm_magic == SKM_MAGIC);
1368
1369 if (likely(skm->skm_avail)) {
1370 /* Object available in CPU cache, use it */
1371 obj = skm->skm_objs[--skm->skm_avail];
1372 skm->skm_age = jiffies;
1373 } else {
1374 obj = spl_cache_refill(skc, skm, flags);
1375 if (obj == NULL)
1376 goto restart;
1377 }
1378
1379 local_irq_enable();
1380 ASSERT(obj);
1381 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1382
1383ret:
1384 /* Pre-emptively migrate object to CPU L1 cache */
1385 if (obj) {
 1386 if (skc->skc_ctor)
1387 skc->skc_ctor(obj, skc->skc_private, flags);
1388 else
1389 prefetchw(obj);
1390 }
1391
1392 atomic_dec(&skc->skc_ref);
1393
1394 return (obj);
1395}
1396
1397EXPORT_SYMBOL(spl_kmem_cache_alloc);
1398
1399/*
 1400 * Free an object back to the local per-cpu magazine; there is no
 1401 * guarantee that this is the same magazine the object was originally
 1402 * allocated from. We may need to flush entries from the magazine
 1403 * back to the slabs to make space.
1404 */
1405void
1406spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1407{
1408 spl_kmem_magazine_t *skm;
1409 unsigned long flags;
1410
1411 ASSERT(skc->skc_magic == SKC_MAGIC);
1412 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1413 atomic_inc(&skc->skc_ref);
1414
1415 /*
1416 * Run the destructor
1417 */
1418 if (skc->skc_dtor)
1419 skc->skc_dtor(obj, skc->skc_private);
1420
1421 /*
 1422 * Free the object back to the underlying Linux slab.
1423 */
1424 if (skc->skc_flags & KMC_SLAB) {
1425 kmem_cache_free(skc->skc_linux_cache, obj);
1426 goto out;
1427 }
1428
1429 /*
1430 * Only virtual slabs may have emergency objects and these objects
1431 * are guaranteed to have physical addresses. They must be removed
 1432 * from the tree of emergency objects and then freed.
1433 */
1434 if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
1435 spl_emergency_free(skc, obj);
1436 goto out;
1437 }
1438
1439 local_irq_save(flags);
1440
 1441 /* Safe to update per-cpu structure without lock. Because
1442 * no remote memory allocation tracking is being performed
1443 * it is entirely possible to allocate an object from one
1444 * CPU cache and return it to another. */
1445 skm = skc->skc_mag[smp_processor_id()];
1446 ASSERT(skm->skm_magic == SKM_MAGIC);
1447
1448 /* Per-CPU cache full, flush it to make space */
1449 if (unlikely(skm->skm_avail >= skm->skm_size))
1450 spl_cache_flush(skc, skm, skm->skm_refill);
1451
1452 /* Available space in cache, use it */
1453 skm->skm_objs[skm->skm_avail++] = obj;
1454
1455 local_irq_restore(flags);
1456out:
1457 atomic_dec(&skc->skc_ref);
1458}
1459EXPORT_SYMBOL(spl_kmem_cache_free);
1460
1461/*
1462 * The generic shrinker function for all caches. Under Linux a shrinker
1463 * may not be tightly coupled with a slab cache. In fact Linux always
1464 * systematically tries calling all registered shrinker callbacks which
1465 * report that they contain unused objects. Because of this we only
1466 * register one shrinker function in the shim layer for all slab caches.
1467 * We always attempt to shrink all caches when this generic shrinker
1468 * is called.
1469 *
1470 * If sc->nr_to_scan is zero, the caller is requesting a query of the
1471 * number of objects which can potentially be freed. If it is nonzero,
1472 * the request is to free that many objects.
1473 *
1474 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
1475 * in struct shrinker and also require the shrinker to return the number
1476 * of objects freed.
1477 *
1478 * Older kernels require the shrinker to return the number of freeable
1479 * objects following the freeing of nr_to_free.
1480 *
1481 * Linux semantics differ from those under Solaris, which are to
1482 * free all available objects which may (and probably will) be more
1483 * objects than the requested nr_to_scan.
1484 */
1485static spl_shrinker_t
1486__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
1487 struct shrink_control *sc)
1488{
1489 spl_kmem_cache_t *skc;
1490 int alloc = 0;
1491
1492 down_read(&spl_kmem_cache_sem);
1493 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1494 if (sc->nr_to_scan) {
1495#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
1496 uint64_t oldalloc = skc->skc_obj_alloc;
1497 spl_kmem_cache_reap_now(skc,
1498 MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1499 if (oldalloc > skc->skc_obj_alloc)
1500 alloc += oldalloc - skc->skc_obj_alloc;
1501#else
1502 spl_kmem_cache_reap_now(skc,
1503 MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1504 alloc += skc->skc_obj_alloc;
1505#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
1506 } else {
1507 /* Request to query number of freeable objects */
1508 alloc += skc->skc_obj_alloc;
1509 }
1510 }
1511 up_read(&spl_kmem_cache_sem);
1512
1513 /*
1514 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
1515 * This functionality only exists to work around a rare issue where
1516 * shrink_slabs() is repeatedly invoked by many cores causing the
1517 * system to thrash.
1518 */
1519 if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
1520 return (SHRINK_STOP);
1521
1522 return (MAX(alloc, 0));
1523}
1524
1525SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
1526
1527/*
1528 * Call the registered reclaim function for a cache. Depending on how
1529 * many and which objects are released it may simply repopulate the
1530 * local magazine which will then need to age-out. Objects which cannot
 1531 * fit in the magazine will be released back to their slabs which will
 1532 * also need to age out before being released. This is all just best
1533 * effort and we do not want to thrash creating and destroying slabs.
1534 */
1535void
1536spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
1537{
1538 ASSERT(skc->skc_magic == SKC_MAGIC);
1539 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1540
1541 atomic_inc(&skc->skc_ref);
1542
1543 /*
1544 * Execute the registered reclaim callback if it exists. The
 1545 * per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
1546 */
1547 if (skc->skc_flags & KMC_SLAB) {
1548 if (skc->skc_reclaim)
1549 skc->skc_reclaim(skc->skc_private);
1550
1551 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
1552 kmem_cache_shrink(skc->skc_linux_cache);
1553
1554 goto out;
1555 }
1556
1557 /*
1558 * Prevent concurrent cache reaping when contended.
1559 */
1560 if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
1561 goto out;
1562
1563 /*
1564 * When a reclaim function is available it may be invoked repeatedly
1565 * until at least a single slab can be freed. This ensures that we
1566 * do free memory back to the system. This helps minimize the chance
1567 * of an OOM event when the bulk of memory is used by the slab.
1568 *
1569 * When free slabs are already available the reclaim callback will be
1570 * skipped. Additionally, if no forward progress is detected despite
1571 * a reclaim function the cache will be skipped to avoid deadlock.
1572 *
1573 * Longer term this would be the correct place to add the code which
 1574 * repacks the slabs in order to minimize fragmentation.
1575 */
1576 if (skc->skc_reclaim) {
1577 uint64_t objects = UINT64_MAX;
1578 int do_reclaim;
1579
1580 do {
1581 spin_lock(&skc->skc_lock);
1582 do_reclaim =
1583 (skc->skc_slab_total > 0) &&
1584 ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
1585 (skc->skc_obj_alloc < objects);
1586
1587 objects = skc->skc_obj_alloc;
1588 spin_unlock(&skc->skc_lock);
1589
1590 if (do_reclaim)
1591 skc->skc_reclaim(skc->skc_private);
1592
1593 } while (do_reclaim);
1594 }
1595
1596 /* Reclaim from the magazine then the slabs ignoring age and delay. */
1597 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
1598 spl_kmem_magazine_t *skm;
1599 unsigned long irq_flags;
1600
1601 local_irq_save(irq_flags);
1602 skm = skc->skc_mag[smp_processor_id()];
1603 spl_cache_flush(skc, skm, skm->skm_avail);
1604 local_irq_restore(irq_flags);
1605 }
1606
1607 spl_slab_reclaim(skc, count, 1);
1608 clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1609 smp_wmb();
1610 wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
1611out:
1612 atomic_dec(&skc->skc_ref);
1613}
1614EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1615
1616/*
1617 * Reap all free slabs from all registered caches.
1618 */
1619void
1620spl_kmem_reap(void)
1621{
1622 struct shrink_control sc;
1623
1624 sc.nr_to_scan = KMC_REAP_CHUNK;
1625 sc.gfp_mask = GFP_KERNEL;
1626
1627 (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
1628}
1629EXPORT_SYMBOL(spl_kmem_reap);
1630
1631int
1632spl_kmem_cache_init(void)
1633{
1634 init_rwsem(&spl_kmem_cache_sem);
1635 INIT_LIST_HEAD(&spl_kmem_cache_list);
1636 spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
1637 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
1638 spl_register_shrinker(&spl_kmem_cache_shrinker);
1639
1640 return (0);
1641}
1642
1643void
1644spl_kmem_cache_fini(void)
1645{
1646 spl_unregister_shrinker(&spl_kmem_cache_shrinker);
1647 taskq_destroy(spl_kmem_cache_taskq);
1648}