1 /*
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
6 * UCRL-CODE-235197
7 *
8 * This file is part of the SPL, Solaris Porting Layer.
9 * For details, see <http://zfsonlinux.org/>.
10 *
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 *
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 * for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25 #include <sys/kmem.h>
26 #include <sys/kmem_cache.h>
27 #include <sys/taskq.h>
28 #include <sys/timer.h>
29 #include <sys/vmem.h>
30 #include <linux/slab.h>
31 #include <linux/swap.h>
32 #include <linux/mm_compat.h>
33 #include <linux/wait_compat.h>
34
35 /*
36 * Within the scope of spl-kmem.c file the kmem_cache_* definitions
37 * are removed to allow access to the real Linux slab allocator.
38 */
39 #undef kmem_cache_destroy
40 #undef kmem_cache_create
41 #undef kmem_cache_alloc
42 #undef kmem_cache_free
43
44
45 /*
46 * Cache expiration was implemented because it was part of the default Solaris
47 * kmem_cache behavior. The idea is that per-cpu objects which haven't been
48 * accessed in several seconds should be returned to the cache. On the other
49 * hand Linux slabs never move objects back to the slabs unless there is
50 * memory pressure on the system. By default the Linux method is enabled
51 * because it has been shown to improve responsiveness on low memory systems.
52 * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
53 */
54 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
55 EXPORT_SYMBOL(spl_kmem_cache_expire);
56 module_param(spl_kmem_cache_expire, uint, 0644);
57 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
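/*
 * Illustrative usage, assuming the standard module_param() sysfs layout
 * (not something defined in this file): because the parameter above is
 * registered with mode 0644 it can typically be changed at runtime by
 * writing 0x1 (expire by age) or 0x2 (expire on low memory) to
 * /sys/module/spl/parameters/spl_kmem_cache_expire.
 */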
58
59 /*
60 * The default behavior is to report the number of objects remaining in the
61 * cache. This allows the Linux VM to repeatedly reclaim objects from the
62 * cache when memory is low to satisfy other memory allocations. Alternately,
63 * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
64 * is reclaimed. This may increase the likelihood of out of memory events.
65 */
66 unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
67 module_param(spl_kmem_cache_reclaim, uint, 0644);
68 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
69
70 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
71 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
72 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
73
74 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
75 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
76 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
77 "Minimal number of objects per slab");
78
79 unsigned int spl_kmem_cache_max_size = 32;
80 module_param(spl_kmem_cache_max_size, uint, 0644);
81 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
82
83 /*
84 * For small objects the Linux slab allocator should be used to make the most
85 * efficient use of the memory. However, large objects are not supported by
86 * the Linux slab and therefore the SPL implementation is preferred. A cutoff
87 * of 16K was determined to be optimal for architectures using 4K pages.
88 */
89 #if PAGE_SIZE == 4096
90 unsigned int spl_kmem_cache_slab_limit = 16384;
91 #else
92 unsigned int spl_kmem_cache_slab_limit = 0;
93 #endif
94 module_param(spl_kmem_cache_slab_limit, uint, 0644);
95 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
96 "Objects less than N bytes use the Linux slab");
97
98 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
99 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
100 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
101 "Objects less than N bytes use the kmalloc");
102
103 /*
104 * Slab allocation interfaces
105 *
106 * While the Linux slab implementation was inspired by the Solaris
107 * implementation I cannot use it to emulate the Solaris APIs. I
108 * require two features which are not provided by the Linux slab.
109 *
110 * 1) Constructors AND destructors. Recent versions of the Linux
111 * kernel have removed support for destructors. This is a deal
112 * breaker for the SPL which contains particularly expensive
113 * initializers for mutexes, condition variables, etc. We also
114 * require a minimal level of cleanup for these data types, unlike
115 * many Linux data types which do not need to be explicitly destroyed.
116 *
117 * 2) Virtual address space backed slab. Callers of the Solaris slab
118 * expect it to work well for both small and very large allocations.
119 * Because of memory fragmentation the Linux slab which is backed
120 * by kmalloc'ed memory performs very badly when confronted with
121 * large numbers of large allocations. Basing the slab on the
122 * virtual address space removes the need for contiguous pages
123 * and greatly improves performance for large allocations.
124 *
125 * For these reasons, the SPL has its own slab implementation with
126 * the needed features. It is not as highly optimized as either the
127 * Solaris or Linux slabs, but it should get me most of what is
128 * needed until it can be optimized or obsoleted by another approach.
129 *
130 * One serious concern I do have about this method is the relatively
131 * small virtual address space on 32bit arches. This will seriously
132 * constrain the size of the slab caches and their performance.
133 *
134 * XXX: Improve the partial slab list by carefully maintaining a
135 * strict ordering of fullest to emptiest slabs based on
136 * the slab reference count. This guarantees that when freeing
137 * slabs back to the system we need only linearly traverse the
138 * last N slabs in the list to discover all the freeable slabs.
139 *
140 * XXX: NUMA awareness for optionally allocating memory close to a
141 * particular core. This can be advantageous if you know the slab
142 * object will be short lived and primarily accessed from one core.
143 *
144 * XXX: Slab coloring may also yield performance improvements and would
145 * be desirable to implement.
146 */
147
148 struct list_head spl_kmem_cache_list; /* List of caches */
149 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
150 taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
151
152 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
153
154 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
155 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
156 spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
157
158 static void *
159 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
160 {
161 void *ptr;
162
163 ASSERT(ISP2(size));
164
165 if (skc->skc_flags & KMC_KMEM)
166 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
167 get_order(size));
168 else
169 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
170
171 /* Resulting allocated memory will be page aligned */
172 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
173
174 return (ptr);
175 }
176
177 static void
178 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
179 {
180 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
181 ASSERT(ISP2(size));
182
183 /*
184 * The Linux direct reclaim path uses this out of band value to
185 * determine if forward progress is being made. Normally this is
186 * incremented by kmem_freepages() which is part of the various
187 * Linux slab implementations. However, since we are using none
188 * of that infrastructure we are responsible for incrementing it.
189 */
190 if (current->reclaim_state)
191 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
192
193 if (skc->skc_flags & KMC_KMEM)
194 free_pages((unsigned long)ptr, get_order(size));
195 else
196 vfree(ptr);
197 }
198
199 /*
200 * Required space for each aligned sks.
201 */
202 static inline uint32_t
203 spl_sks_size(spl_kmem_cache_t *skc)
204 {
205 return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
206 skc->skc_obj_align, uint32_t));
207 }
208
209 /*
210 * Required space for each aligned object.
211 */
212 static inline uint32_t
213 spl_obj_size(spl_kmem_cache_t *skc)
214 {
215 uint32_t align = skc->skc_obj_align;
216
217 return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
218 P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
219 }
220
221 /*
222 * Look up the spl_kmem_obj_t for a given object.
223 */
224 static inline spl_kmem_obj_t *
225 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
226 {
227 return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
228 skc->skc_obj_align, uint32_t));
229 }
230
231 /*
232 * Required space for each offslab object taking into account alignment
233 * restrictions and the power-of-two requirement of kv_alloc().
234 */
235 static inline uint32_t
236 spl_offslab_size(spl_kmem_cache_t *skc)
237 {
238 return (1UL << (fls64(spl_obj_size(skc)) + 1));
239 }
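/*
 * Worked example of the rounding above (illustrative figures only): for a
 * spl_obj_size() of 3000 bytes fls64() returns 12, so the offslab
 * allocation size becomes 1UL << 13 = 8192 bytes, satisfying the
 * power-of-two requirement of kv_alloc() with headroom for alignment.
 */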
240
241 /*
242 * It's important that we pack the spl_kmem_obj_t structure and the
243 * actual objects into one large address space to minimize the number
244 * of calls to the allocator. It is far better to do a few large
245 * allocations and then subdivide them ourselves. Now which allocator
246 * we use requires balancing a few trade-offs.
247 *
248 * For small objects we use kmem_alloc() because as long as you are
249 * only requesting a small number of pages (ideally just one) it's cheap.
250 * However, when you start requesting multiple pages with kmem_alloc()
251 * it gets increasingly expensive since it requires contiguous pages.
252 * For this reason we shift to vmem_alloc() for slabs of large objects
253 * which removes the need for contiguous pages. We do not use
254 * vmem_alloc() in all cases because there is significant locking
255 * overhead in __get_vm_area_node(). This function takes a single
256 * global lock when acquiring an available virtual address range which
257 * serializes all vmem_alloc()'s for all slab caches. Using slightly
258 * different allocation functions for small and large objects should
259 * give us the best of both worlds.
260 *
261 * KMC_ONSLAB KMC_OFFSLAB
262 *
263 * +------------------------+ +-----------------+
264 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
265 * | skc_obj_size <-+ | | +-----------------+ | |
266 * | spl_kmem_obj_t | | | |
267 * | skc_obj_size <---+ | +-----------------+ | |
268 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
269 * | ... v | | spl_kmem_obj_t | |
270 * +------------------------+ +-----------------+ v
271 */
272 static spl_kmem_slab_t *
273 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
274 {
275 spl_kmem_slab_t *sks;
276 spl_kmem_obj_t *sko, *n;
277 void *base, *obj;
278 uint32_t obj_size, offslab_size = 0;
279 int i, rc = 0;
280
281 base = kv_alloc(skc, skc->skc_slab_size, flags);
282 if (base == NULL)
283 return (NULL);
284
285 sks = (spl_kmem_slab_t *)base;
286 sks->sks_magic = SKS_MAGIC;
287 sks->sks_objs = skc->skc_slab_objs;
288 sks->sks_age = jiffies;
289 sks->sks_cache = skc;
290 INIT_LIST_HEAD(&sks->sks_list);
291 INIT_LIST_HEAD(&sks->sks_free_list);
292 sks->sks_ref = 0;
293 obj_size = spl_obj_size(skc);
294
295 if (skc->skc_flags & KMC_OFFSLAB)
296 offslab_size = spl_offslab_size(skc);
297
298 for (i = 0; i < sks->sks_objs; i++) {
299 if (skc->skc_flags & KMC_OFFSLAB) {
300 obj = kv_alloc(skc, offslab_size, flags);
301 if (!obj) {
302 rc = -ENOMEM;
303 goto out;
304 }
305 } else {
306 obj = base + spl_sks_size(skc) + (i * obj_size);
307 }
308
309 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
310 sko = spl_sko_from_obj(skc, obj);
311 sko->sko_addr = obj;
312 sko->sko_magic = SKO_MAGIC;
313 sko->sko_slab = sks;
314 INIT_LIST_HEAD(&sko->sko_list);
315 list_add_tail(&sko->sko_list, &sks->sks_free_list);
316 }
317
318 out:
319 if (rc) {
320 if (skc->skc_flags & KMC_OFFSLAB)
321 list_for_each_entry_safe(sko,
322 n, &sks->sks_free_list, sko_list)
323 kv_free(skc, sko->sko_addr, offslab_size);
324
325 kv_free(skc, base, skc->skc_slab_size);
326 sks = NULL;
327 }
328
329 return (sks);
330 }
331
332 /*
333 * Remove a slab from the complete or partial list. It must be called with
334 * the 'skc->skc_lock' held but the actual free must be performed
335 * outside the lock to prevent deadlocking on vmem addresses.
336 */
337 static void
338 spl_slab_free(spl_kmem_slab_t *sks,
339 struct list_head *sks_list, struct list_head *sko_list)
340 {
341 spl_kmem_cache_t *skc;
342
343 ASSERT(sks->sks_magic == SKS_MAGIC);
344 ASSERT(sks->sks_ref == 0);
345
346 skc = sks->sks_cache;
347 ASSERT(skc->skc_magic == SKC_MAGIC);
348 ASSERT(spin_is_locked(&skc->skc_lock));
349
350 /*
351 * Update slab/objects counters in the cache, then remove the
352 * slab from the skc->skc_partial_list. Finally add the slab
353 * and all its objects in to the private work lists where the
354 * destructors will be called and the memory freed to the system.
355 */
356 skc->skc_obj_total -= sks->sks_objs;
357 skc->skc_slab_total--;
358 list_del(&sks->sks_list);
359 list_add(&sks->sks_list, sks_list);
360 list_splice_init(&sks->sks_free_list, sko_list);
361 }
362
363 /*
364 * Traverse all the partial slabs attached to a cache and free those
365 * which are currently empty and have not been touched for
366 * skc_delay seconds to avoid thrashing. The count argument is
367 * passed to optionally cap the number of slabs reclaimed; a count
368 * of zero means try to reclaim everything. When flag is set we
369 * always free an available slab regardless of age.
370 */
371 static void
372 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
373 {
374 spl_kmem_slab_t *sks, *m;
375 spl_kmem_obj_t *sko, *n;
376 LIST_HEAD(sks_list);
377 LIST_HEAD(sko_list);
378 uint32_t size = 0;
379 int i = 0;
380
381 /*
382 * Move empty slabs and objects which have not been touched in
383 * skc_delay seconds onto private lists to be freed outside
384 * the spin lock. This delay time is important to avoid thrashing;
385 * however, when flag is set the delay will not be used.
386 */
387 spin_lock(&skc->skc_lock);
388 list_for_each_entry_safe_reverse(sks, m,
389 &skc->skc_partial_list, sks_list) {
390 /*
391 * All empty slabs are at the end of skc->skc_partial_list,
392 * therefore once a non-empty slab is found we can stop
393 * scanning. Additionally, stop when reaching the target
394 * reclaim 'count' if a non-zero threshold is given.
395 */
396 if ((sks->sks_ref > 0) || (count && i >= count))
397 break;
398
399 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) ||
400 flag) {
401 spl_slab_free(sks, &sks_list, &sko_list);
402 i++;
403 }
404 }
405 spin_unlock(&skc->skc_lock);
406
407 /*
408 * The following two loops ensure all the object destructors are
409 * run, any offslab objects are freed, and the slabs themselves
410 * are freed. This is all done outside the skc->skc_lock since
411 * this allows the destructor to sleep, and allows us to perform
412 * a conditional reschedule when freeing a large number of
413 * objects and slabs back to the system.
414 */
415 if (skc->skc_flags & KMC_OFFSLAB)
416 size = spl_offslab_size(skc);
417
418 list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
419 ASSERT(sko->sko_magic == SKO_MAGIC);
420
421 if (skc->skc_flags & KMC_OFFSLAB)
422 kv_free(skc, sko->sko_addr, size);
423 }
424
425 list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
426 ASSERT(sks->sks_magic == SKS_MAGIC);
427 kv_free(skc, sks, skc->skc_slab_size);
428 }
429 }
430
431 static spl_kmem_emergency_t *
432 spl_emergency_search(struct rb_root *root, void *obj)
433 {
434 struct rb_node *node = root->rb_node;
435 spl_kmem_emergency_t *ske;
436 unsigned long address = (unsigned long)obj;
437
438 while (node) {
439 ske = container_of(node, spl_kmem_emergency_t, ske_node);
440
441 if (address < (unsigned long)ske->ske_obj)
442 node = node->rb_left;
443 else if (address > (unsigned long)ske->ske_obj)
444 node = node->rb_right;
445 else
446 return (ske);
447 }
448
449 return (NULL);
450 }
451
452 static int
453 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
454 {
455 struct rb_node **new = &(root->rb_node), *parent = NULL;
456 spl_kmem_emergency_t *ske_tmp;
457 unsigned long address = (unsigned long)ske->ske_obj;
458
459 while (*new) {
460 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
461
462 parent = *new;
463 if (address < (unsigned long)ske_tmp->ske_obj)
464 new = &((*new)->rb_left);
465 else if (address > (unsigned long)ske_tmp->ske_obj)
466 new = &((*new)->rb_right);
467 else
468 return (0);
469 }
470
471 rb_link_node(&ske->ske_node, parent, new);
472 rb_insert_color(&ske->ske_node, root);
473
474 return (1);
475 }
476
477 /*
478 * Allocate a single emergency object and track it in a red black tree.
479 */
480 static int
481 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
482 {
483 spl_kmem_emergency_t *ske;
484 int empty;
485
486 /* Last chance: use a partial slab if one now exists */
487 spin_lock(&skc->skc_lock);
488 empty = list_empty(&skc->skc_partial_list);
489 spin_unlock(&skc->skc_lock);
490 if (!empty)
491 return (-EEXIST);
492
493 ske = kmalloc(sizeof (*ske), flags);
494 if (ske == NULL)
495 return (-ENOMEM);
496
497 ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
498 if (ske->ske_obj == NULL) {
499 kfree(ske);
500 return (-ENOMEM);
501 }
502
503 spin_lock(&skc->skc_lock);
504 empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
505 if (likely(empty)) {
506 skc->skc_obj_total++;
507 skc->skc_obj_emergency++;
508 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
509 skc->skc_obj_emergency_max = skc->skc_obj_emergency;
510 }
511 spin_unlock(&skc->skc_lock);
512
513 if (unlikely(!empty)) {
514 kfree(ske->ske_obj);
515 kfree(ske);
516 return (-EINVAL);
517 }
518
519 *obj = ske->ske_obj;
520
521 return (0);
522 }
523
524 /*
525 * Locate the passed object in the red black tree and free it.
526 */
527 static int
528 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
529 {
530 spl_kmem_emergency_t *ske;
531
532 spin_lock(&skc->skc_lock);
533 ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
534 if (likely(ske)) {
535 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
536 skc->skc_obj_emergency--;
537 skc->skc_obj_total--;
538 }
539 spin_unlock(&skc->skc_lock);
540
541 if (unlikely(ske == NULL))
542 return (-ENOENT);
543
544 kfree(ske->ske_obj);
545 kfree(ske);
546
547 return (0);
548 }
549
550 /*
551 * Release objects from the per-cpu magazine back to their slab. The flush
552 * argument contains the max number of entries to remove from the magazine.
553 */
554 static void
555 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
556 {
557 int i, count = MIN(flush, skm->skm_avail);
558
559 ASSERT(skc->skc_magic == SKC_MAGIC);
560 ASSERT(skm->skm_magic == SKM_MAGIC);
561 ASSERT(spin_is_locked(&skc->skc_lock));
562
563 for (i = 0; i < count; i++)
564 spl_cache_shrink(skc, skm->skm_objs[i]);
565
566 skm->skm_avail -= count;
567 memmove(skm->skm_objs, &(skm->skm_objs[count]),
568 sizeof (void *) * skm->skm_avail);
569 }
570
571 static void
572 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
573 {
574 spin_lock(&skc->skc_lock);
575 __spl_cache_flush(skc, skm, flush);
576 spin_unlock(&skc->skc_lock);
577 }
578
579 static void
580 spl_magazine_age(void *data)
581 {
582 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
583 spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
584
585 ASSERT(skm->skm_magic == SKM_MAGIC);
586 ASSERT(skm->skm_cpu == smp_processor_id());
587 ASSERT(irqs_disabled());
588
589 /* There are no available objects or they are too young to age out */
590 if ((skm->skm_avail == 0) ||
591 time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
592 return;
593
594 /*
595 * Because we're executing in interrupt context we may have
596 * interrupted the holder of this lock. To avoid a potential
597 * deadlock return if the lock is contended.
598 */
599 if (!spin_trylock(&skc->skc_lock))
600 return;
601
602 __spl_cache_flush(skc, skm, skm->skm_refill);
603 spin_unlock(&skc->skc_lock);
604 }
605
606 /*
607 * Called regularly to keep a downward pressure on the cache.
608 *
609 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
610 * be returned to the caches. This is done to prevent idle magazines from
611 * holding memory which could be better used elsewhere. The delay is
612 * present to prevent thrashing the magazine.
613 *
614 * The newly released objects may result in empty partial slabs. Those
615 * slabs should be released to the system. Otherwise moving the objects
616 * out of the magazines is just wasted work.
617 */
618 static void
619 spl_cache_age(void *data)
620 {
621 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
622 taskqid_t id = 0;
623
624 ASSERT(skc->skc_magic == SKC_MAGIC);
625
626 /* Dynamically disabled at run time */
627 if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
628 return;
629
630 atomic_inc(&skc->skc_ref);
631
632 if (!(skc->skc_flags & KMC_NOMAGAZINE))
633 on_each_cpu(spl_magazine_age, skc, 1);
634
635 spl_slab_reclaim(skc, skc->skc_reap, 0);
636
637 while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
638 id = taskq_dispatch_delay(
639 spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
640 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
641
642 /* Destroy issued after dispatch, immediately cancel it */
643 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
644 taskq_cancel_id(spl_kmem_cache_taskq, id);
645 }
646
647 spin_lock(&skc->skc_lock);
648 skc->skc_taskqid = id;
649 spin_unlock(&skc->skc_lock);
650
651 atomic_dec(&skc->skc_ref);
652 }
653
654 /*
655 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
656 * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
657 * for very small objects we may end up with more than this so as not
658 * to waste space in the minimal allocation of a single page. Also, for
659 * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
660 * any lower than this and we will fail.
661 */
662 static int
663 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
664 {
665 uint32_t sks_size, obj_size, max_size;
666
667 if (skc->skc_flags & KMC_OFFSLAB) {
668 *objs = spl_kmem_cache_obj_per_slab;
669 *size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
670 return (0);
671 } else {
672 sks_size = spl_sks_size(skc);
673 obj_size = spl_obj_size(skc);
674
675 if (skc->skc_flags & KMC_KMEM)
676 max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
677 else
678 max_size = (spl_kmem_cache_max_size * 1024 * 1024);
679
680 /* Power of two sized slab */
681 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
682 *objs = (*size - sks_size) / obj_size;
683 if (*objs >= spl_kmem_cache_obj_per_slab)
684 return (0);
685 }
686
687 /*
688 * Unable to satisfy the target objects per slab, fall back to
689 * allocating a maximally sized slab and, assuming it can
690 * contain the minimum object count, use it. If not, fail.
691 */
692 *size = max_size;
693 *objs = (*size - sks_size) / obj_size;
694 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
695 return (0);
696 }
697
698 return (-ENOSPC);
699 }
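/*
 * Illustrative sizing walk-through, assuming 4 KiB pages (figures are
 * examples, not derived from any particular cache): for an on-slab cache
 * whose spl_obj_size() is roughly 1 KiB, the loop above first considers a
 * 4 KiB slab (about 3 objects after the sks header), then 8 KiB, and so
 * on, stopping at the first power-of-two slab size that holds at least
 * spl_kmem_cache_obj_per_slab objects.
 */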
700
701 /*
702 * Make a guess at a reasonable per-cpu magazine size based on the size of
703 * each object and the cost of caching N of them in each magazine. Long
704 * term this should really adapt based on an observed usage heuristic.
705 */
706 static int
707 spl_magazine_size(spl_kmem_cache_t *skc)
708 {
709 uint32_t obj_size = spl_obj_size(skc);
710 int size;
711
712 /* Per-magazine sizes below assume a 4 KiB page size */
713 if (obj_size > (PAGE_SIZE * 256))
714 size = 4; /* Minimum 4 MiB per-magazine */
715 else if (obj_size > (PAGE_SIZE * 32))
716 size = 16; /* Minimum 2 MiB per-magazine */
717 else if (obj_size > (PAGE_SIZE))
718 size = 64; /* Minimum 256 KiB per-magazine */
719 else if (obj_size > (PAGE_SIZE / 4))
720 size = 128; /* Minimum 128 KiB per-magazine */
721 else
722 size = 256;
723
724 return (size);
725 }
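/*
 * Worked example, assuming 4 KiB pages: an 8 KiB object falls into the
 * third branch above (larger than PAGE_SIZE but not larger than 32 pages)
 * and gets a 64 entry magazine, i.e. at least 512 KiB of cached objects
 * per CPU when the magazine is full.
 */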
726
727 /*
728 * Allocate a per-cpu magazine to associate with a specific core.
729 */
730 static spl_kmem_magazine_t *
731 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
732 {
733 spl_kmem_magazine_t *skm;
734 int size = sizeof (spl_kmem_magazine_t) +
735 sizeof (void *) * skc->skc_mag_size;
736
737 skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
738 if (skm) {
739 skm->skm_magic = SKM_MAGIC;
740 skm->skm_avail = 0;
741 skm->skm_size = skc->skc_mag_size;
742 skm->skm_refill = skc->skc_mag_refill;
743 skm->skm_cache = skc;
744 skm->skm_age = jiffies;
745 skm->skm_cpu = cpu;
746 }
747
748 return (skm);
749 }
750
751 /*
752 * Free a per-cpu magazine associated with a specific core.
753 */
754 static void
755 spl_magazine_free(spl_kmem_magazine_t *skm)
756 {
757 int size = sizeof (spl_kmem_magazine_t) +
758 sizeof (void *) * skm->skm_size;
759
760 ASSERT(skm->skm_magic == SKM_MAGIC);
761 ASSERT(skm->skm_avail == 0);
762
763 kmem_free(skm, size);
764 }
765
766 /*
767 * Create all per-cpu magazines of reasonable sizes.
768 */
769 static int
770 spl_magazine_create(spl_kmem_cache_t *skc)
771 {
772 int i;
773
774 if (skc->skc_flags & KMC_NOMAGAZINE)
775 return (0);
776
777 skc->skc_mag_size = spl_magazine_size(skc);
778 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
779
780 for_each_online_cpu(i) {
781 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
782 if (!skc->skc_mag[i]) {
783 for (i--; i >= 0; i--)
784 spl_magazine_free(skc->skc_mag[i]);
785
786 return (-ENOMEM);
787 }
788 }
789
790 return (0);
791 }
792
793 /*
794 * Destroy all per-cpu magazines.
795 */
796 static void
797 spl_magazine_destroy(spl_kmem_cache_t *skc)
798 {
799 spl_kmem_magazine_t *skm;
800 int i;
801
802 if (skc->skc_flags & KMC_NOMAGAZINE)
803 return;
804
805 for_each_online_cpu(i) {
806 skm = skc->skc_mag[i];
807 spl_cache_flush(skc, skm, skm->skm_avail);
808 spl_magazine_free(skm);
809 }
810 }
811
812 /*
813 * Create an object cache based on the following arguments:
814 * name cache name
815 * size cache object size
816 * align cache object alignment
817 * ctor cache object constructor
818 * dtor cache object destructor
819 * reclaim cache object reclaim
820 * priv cache private data for ctor/dtor/reclaim
821 * vmp unused, must be NULL
822 * flags
823 * KMC_NOTOUCH Disable cache object aging (unsupported)
824 * KMC_NODEBUG Disable debugging (unsupported)
825 * KMC_NOHASH Disable hashing (unsupported)
826 * KMC_QCACHE Disable qcache (unsupported)
827 * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
828 * KMC_KMEM Force kmem backed cache
829 * KMC_VMEM Force vmem backed cache
830 * KMC_SLAB Force Linux slab backed cache
831 * KMC_OFFSLAB Locate objects off the slab
832 */
833 spl_kmem_cache_t *
834 spl_kmem_cache_create(char *name, size_t size, size_t align,
835 spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
836 void *priv, void *vmp, int flags)
837 {
838 spl_kmem_cache_t *skc;
839 int rc;
840
841 /*
842 * Unsupported flags
843 */
844 ASSERT0(flags & KMC_NOMAGAZINE);
845 ASSERT0(flags & KMC_NOHASH);
846 ASSERT0(flags & KMC_QCACHE);
847 ASSERT(vmp == NULL);
848
849 might_sleep();
850
851 /*
852 * Allocate memory for a new cache and initialize it. Unfortunately,
853 * this usually ends up being a large allocation of ~32k because
854 * we need to allocate enough memory for the worst case number of
855 * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
856 * explicitly pass KM_NODEBUG to suppress the kmem warning
857 */
858 skc = kmem_zalloc(sizeof (*skc), KM_SLEEP| KM_NODEBUG);
859 if (skc == NULL)
860 return (NULL);
861
862 skc->skc_magic = SKC_MAGIC;
863 skc->skc_name_size = strlen(name) + 1;
864 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
865 if (skc->skc_name == NULL) {
866 kmem_free(skc, sizeof (*skc));
867 return (NULL);
868 }
869 strncpy(skc->skc_name, name, skc->skc_name_size);
870
871 skc->skc_ctor = ctor;
872 skc->skc_dtor = dtor;
873 skc->skc_reclaim = reclaim;
874 skc->skc_private = priv;
875 skc->skc_vmp = vmp;
876 skc->skc_linux_cache = NULL;
877 skc->skc_flags = flags;
878 skc->skc_obj_size = size;
879 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
880 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
881 skc->skc_reap = SPL_KMEM_CACHE_REAP;
882 atomic_set(&skc->skc_ref, 0);
883
884 INIT_LIST_HEAD(&skc->skc_list);
885 INIT_LIST_HEAD(&skc->skc_complete_list);
886 INIT_LIST_HEAD(&skc->skc_partial_list);
887 skc->skc_emergency_tree = RB_ROOT;
888 spin_lock_init(&skc->skc_lock);
889 init_waitqueue_head(&skc->skc_waitq);
890 skc->skc_slab_fail = 0;
891 skc->skc_slab_create = 0;
892 skc->skc_slab_destroy = 0;
893 skc->skc_slab_total = 0;
894 skc->skc_slab_alloc = 0;
895 skc->skc_slab_max = 0;
896 skc->skc_obj_total = 0;
897 skc->skc_obj_alloc = 0;
898 skc->skc_obj_max = 0;
899 skc->skc_obj_deadlock = 0;
900 skc->skc_obj_emergency = 0;
901 skc->skc_obj_emergency_max = 0;
902
903 /*
904 * Verify the requested alignment restriction is sane.
905 */
906 if (align) {
907 VERIFY(ISP2(align));
908 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
909 VERIFY3U(align, <=, PAGE_SIZE);
910 skc->skc_obj_align = align;
911 }
912
913 /*
914 * When no specific type of slab is requested (kmem, vmem, or
915 * linuxslab) then select a cache type based on the object size
916 * and default tunables.
917 */
918 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
919
920 /*
921 * Objects smaller than spl_kmem_cache_slab_limit can
922 * use the Linux slab for better space-efficiency. By
923 * default this functionality is disabled until its
924 * performance characteristics are fully understood.
925 */
926 if (spl_kmem_cache_slab_limit &&
927 size <= (size_t)spl_kmem_cache_slab_limit)
928 skc->skc_flags |= KMC_SLAB;
929
930 /*
931 * Small objects, less than spl_kmem_cache_kmem_limit per
932 * object, should use kmem because their slabs are small.
933 */
934 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
935 skc->skc_flags |= KMC_KMEM;
936
937 /*
938 * All other objects are considered large and are placed
939 * on vmem backed slabs.
940 */
941 else
942 skc->skc_flags |= KMC_VMEM;
943 }
944
945 /*
946 * Given the type of slab allocate the required resources.
947 */
948 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
949 rc = spl_slab_size(skc,
950 &skc->skc_slab_objs, &skc->skc_slab_size);
951 if (rc)
952 goto out;
953
954 rc = spl_magazine_create(skc);
955 if (rc)
956 goto out;
957 } else {
958 skc->skc_linux_cache = kmem_cache_create(
959 skc->skc_name, size, align, 0, NULL);
960 if (skc->skc_linux_cache == NULL) {
961 rc = ENOMEM;
962 goto out;
963 }
964
965 kmem_cache_set_allocflags(skc, __GFP_COMP);
966 skc->skc_flags |= KMC_NOMAGAZINE;
967 }
968
969 if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
970 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
971 spl_cache_age, skc, TQ_SLEEP,
972 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
973
974 down_write(&spl_kmem_cache_sem);
975 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
976 up_write(&spl_kmem_cache_sem);
977
978 return (skc);
979 out:
980 kmem_free(skc->skc_name, skc->skc_name_size);
981 kmem_free(skc, sizeof (*skc));
982 return (NULL);
983 }
984 EXPORT_SYMBOL(spl_kmem_cache_create);
985
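/*
 * Caller-side sketch (hypothetical names, shown only to illustrate how the
 * interfaces exported from this file fit together; error handling omitted):
 *
 *	static spl_kmem_cache_t *my_cache;
 *	void *obj;
 *
 *	my_cache = spl_kmem_cache_create("my_cache", sizeof (my_obj_t), 0,
 *	    my_ctor, my_dtor, NULL, NULL, NULL, 0);
 *	obj = spl_kmem_cache_alloc(my_cache, KM_SLEEP);
 *	...
 *	spl_kmem_cache_free(my_cache, obj);
 *	spl_kmem_cache_destroy(my_cache);
 */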
986 /*
987 * Register a move callback for cache defragmentation.
988 * XXX: Unimplemented but harmless to stub out for now.
989 */
990 void
991 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
992 kmem_cbrc_t (move)(void *, void *, size_t, void *))
993 {
994 ASSERT(move != NULL);
995 }
996 EXPORT_SYMBOL(spl_kmem_cache_set_move);
997
998 /*
999 * Destroy a cache and all objects associated with the cache.
1000 */
1001 void
1002 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1003 {
1004 DECLARE_WAIT_QUEUE_HEAD(wq);
1005 taskqid_t id;
1006
1007 ASSERT(skc->skc_magic == SKC_MAGIC);
1008 ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1009
1010 down_write(&spl_kmem_cache_sem);
1011 list_del_init(&skc->skc_list);
1012 up_write(&spl_kmem_cache_sem);
1013
1014 /* Cancel and wait for any pending delayed tasks */
1015 VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1016
1017 spin_lock(&skc->skc_lock);
1018 id = skc->skc_taskqid;
1019 spin_unlock(&skc->skc_lock);
1020
1021 taskq_cancel_id(spl_kmem_cache_taskq, id);
1022
1023 /*
1024 * Wait until all current callers complete; this is mainly
1025 * to catch the case where a low memory situation triggers a
1026 * cache reaping action which races with this destroy.
1027 */
1028 wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1029
1030 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1031 spl_magazine_destroy(skc);
1032 spl_slab_reclaim(skc, 0, 1);
1033 } else {
1034 ASSERT(skc->skc_flags & KMC_SLAB);
1035 kmem_cache_destroy(skc->skc_linux_cache);
1036 }
1037
1038 spin_lock(&skc->skc_lock);
1039
1040 /*
1041 * Validate there are no objects in use and free all the
1042 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
1043 */
1044 ASSERT3U(skc->skc_slab_alloc, ==, 0);
1045 ASSERT3U(skc->skc_obj_alloc, ==, 0);
1046 ASSERT3U(skc->skc_slab_total, ==, 0);
1047 ASSERT3U(skc->skc_obj_total, ==, 0);
1048 ASSERT3U(skc->skc_obj_emergency, ==, 0);
1049 ASSERT(list_empty(&skc->skc_complete_list));
1050
1051 kmem_free(skc->skc_name, skc->skc_name_size);
1052 spin_unlock(&skc->skc_lock);
1053
1054 kmem_free(skc, sizeof (*skc));
1055 }
1056 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1057
1058 /*
1059 * Allocate an object from a slab attached to the cache. This is used to
1060 * repopulate the per-cpu magazine caches in batches when they run low.
1061 */
1062 static void *
1063 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1064 {
1065 spl_kmem_obj_t *sko;
1066
1067 ASSERT(skc->skc_magic == SKC_MAGIC);
1068 ASSERT(sks->sks_magic == SKS_MAGIC);
1069 ASSERT(spin_is_locked(&skc->skc_lock));
1070
1071 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1072 ASSERT(sko->sko_magic == SKO_MAGIC);
1073 ASSERT(sko->sko_addr != NULL);
1074
1075 /* Remove from sks_free_list */
1076 list_del_init(&sko->sko_list);
1077
1078 sks->sks_age = jiffies;
1079 sks->sks_ref++;
1080 skc->skc_obj_alloc++;
1081
1082 /* Track max obj usage statistics */
1083 if (skc->skc_obj_alloc > skc->skc_obj_max)
1084 skc->skc_obj_max = skc->skc_obj_alloc;
1085
1086 /* Track max slab usage statistics */
1087 if (sks->sks_ref == 1) {
1088 skc->skc_slab_alloc++;
1089
1090 if (skc->skc_slab_alloc > skc->skc_slab_max)
1091 skc->skc_slab_max = skc->skc_slab_alloc;
1092 }
1093
1094 return (sko->sko_addr);
1095 }
1096
1097 /*
1098 * Generic slab allocation function to be run by the global work queues.
1099 * It is responsible for allocating a new slab, linking it into the list
1100 * of partial slabs, and then waking any waiters.
1101 */
1102 static void
1103 spl_cache_grow_work(void *data)
1104 {
1105 spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1106 spl_kmem_cache_t *skc = ska->ska_cache;
1107 spl_kmem_slab_t *sks;
1108
1109 sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1110 spin_lock(&skc->skc_lock);
1111 if (sks) {
1112 skc->skc_slab_total++;
1113 skc->skc_obj_total += sks->sks_objs;
1114 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1115 }
1116
1117 atomic_dec(&skc->skc_ref);
1118 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1119 clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1120 wake_up_all(&skc->skc_waitq);
1121 spin_unlock(&skc->skc_lock);
1122
1123 kfree(ska);
1124 }
1125
1126 /*
1127 * Returns non-zero when a new slab should be available.
1128 */
1129 static int
1130 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1131 {
1132 return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
1133 }
1134
1135 /*
1136 * No available objects on any slabs, create a new slab. Note that this
1137 * functionality is disabled for KMC_SLAB caches which are backed by the
1138 * Linux slab.
1139 */
1140 static int
1141 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1142 {
1143 int remaining, rc;
1144
1145 ASSERT(skc->skc_magic == SKC_MAGIC);
1146 ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1147 might_sleep();
1148 *obj = NULL;
1149
1150 /*
1151 * Before allocating a new slab wait for any reaping to complete and
1152 * then return so the local magazine can be rechecked for new objects.
1153 */
1154 if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1155 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1156 TASK_UNINTERRUPTIBLE);
1157 return (rc ? rc : -EAGAIN);
1158 }
1159
1160 /*
1161 * This is handled by dispatching a work request to the global work
1162 * queue. This allows us to asynchronously allocate a new slab while
1163 * retaining the ability to safely fall back to smaller synchronous
1164 * allocations to ensure forward progress is always maintained.
1165 */
1166 if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1167 spl_kmem_alloc_t *ska;
1168
1169 ska = kmalloc(sizeof (*ska), flags);
1170 if (ska == NULL) {
1171 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1172 wake_up_all(&skc->skc_waitq);
1173 return (-ENOMEM);
1174 }
1175
1176 atomic_inc(&skc->skc_ref);
1177 ska->ska_cache = skc;
1178 ska->ska_flags = flags & ~__GFP_FS;
1179 taskq_init_ent(&ska->ska_tqe);
1180 taskq_dispatch_ent(spl_kmem_cache_taskq,
1181 spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1182 }
1183
1184 /*
1185 * The goal here is to only detect the rare case where a virtual slab
1186 * allocation has deadlocked. We must be careful to minimize the use
1187 * of emergency objects which are more expensive to track. Therefore,
1188 * we set a very long timeout for the asynchronous allocation and if
1189 * the timeout is reached the cache is flagged as deadlocked. From
1190 * this point only new emergency objects will be allocated until the
1191 * asynchronous allocation completes and clears the deadlocked flag.
1192 */
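/*
 * Concretely (a summary of the logic below, timings taken from the code):
 * if the asynchronous grow has not completed within HZ jiffies (about one
 * second) and the cache is vmem backed, KMC_BIT_DEADLOCKED is set and
 * subsequent calls fall back to spl_emergency_alloc() until
 * spl_cache_grow_work() clears the flag.
 */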
1193 if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1194 rc = spl_emergency_alloc(skc, flags, obj);
1195 } else {
1196 remaining = wait_event_timeout(skc->skc_waitq,
1197 spl_cache_grow_wait(skc), HZ);
1198
1199 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1200 spin_lock(&skc->skc_lock);
1201 if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1202 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1203 skc->skc_obj_deadlock++;
1204 }
1205 spin_unlock(&skc->skc_lock);
1206 }
1207
1208 rc = -ENOMEM;
1209 }
1210
1211 return (rc);
1212 }
1213
1214 /*
1215 * Refill a per-cpu magazine with objects from the slabs for this cache.
1216 * Ideally the magazine can be repopulated using existing objects which have
1217 * been released; however, if we are unable to locate enough free objects new
1218 * slabs of objects will be created. On success NULL is returned, otherwise
1219 * the address of a single emergency object is returned for use by the caller.
1220 */
1221 static void *
1222 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1223 {
1224 spl_kmem_slab_t *sks;
1225 int count = 0, rc, refill;
1226 void *obj = NULL;
1227
1228 ASSERT(skc->skc_magic == SKC_MAGIC);
1229 ASSERT(skm->skm_magic == SKM_MAGIC);
1230
1231 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1232 spin_lock(&skc->skc_lock);
1233
1234 while (refill > 0) {
1235 /* No slabs available we may need to grow the cache */
1236 if (list_empty(&skc->skc_partial_list)) {
1237 spin_unlock(&skc->skc_lock);
1238
1239 local_irq_enable();
1240 rc = spl_cache_grow(skc, flags, &obj);
1241 local_irq_disable();
1242
1243 /* Emergency object for immediate use by caller */
1244 if (rc == 0 && obj != NULL)
1245 return (obj);
1246
1247 if (rc)
1248 goto out;
1249
1250 /* Rescheduled to a different CPU, skm is not local */
1251 if (skm != skc->skc_mag[smp_processor_id()])
1252 goto out;
1253
1254 /*
1255 * Potentially rescheduled to the same CPU but
1256 * allocations may have occurred from this CPU while
1257 * we were sleeping so recalculate max refill.
1258 */
1259 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1260
1261 spin_lock(&skc->skc_lock);
1262 continue;
1263 }
1264
1265 /* Grab the next available slab */
1266 sks = list_entry((&skc->skc_partial_list)->next,
1267 spl_kmem_slab_t, sks_list);
1268 ASSERT(sks->sks_magic == SKS_MAGIC);
1269 ASSERT(sks->sks_ref < sks->sks_objs);
1270 ASSERT(!list_empty(&sks->sks_free_list));
1271
1272 /*
1273 * Consume as many objects as needed to refill the requested
1274 * cache. We must also be careful not to overfill it.
1275 */
1276 while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
1277 ++count) {
1278 ASSERT(skm->skm_avail < skm->skm_size);
1279 ASSERT(count < skm->skm_size);
1280 skm->skm_objs[skm->skm_avail++] =
1281 spl_cache_obj(skc, sks);
1282 }
1283
1284 /* Move slab to skc_complete_list when full */
1285 if (sks->sks_ref == sks->sks_objs) {
1286 list_del(&sks->sks_list);
1287 list_add(&sks->sks_list, &skc->skc_complete_list);
1288 }
1289 }
1290
1291 spin_unlock(&skc->skc_lock);
1292 out:
1293 return (NULL);
1294 }
1295
1296 /*
1297 * Release an object back to the slab from which it came.
1298 */
1299 static void
1300 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1301 {
1302 spl_kmem_slab_t *sks = NULL;
1303 spl_kmem_obj_t *sko = NULL;
1304
1305 ASSERT(skc->skc_magic == SKC_MAGIC);
1306 ASSERT(spin_is_locked(&skc->skc_lock));
1307
1308 sko = spl_sko_from_obj(skc, obj);
1309 ASSERT(sko->sko_magic == SKO_MAGIC);
1310 sks = sko->sko_slab;
1311 ASSERT(sks->sks_magic == SKS_MAGIC);
1312 ASSERT(sks->sks_cache == skc);
1313 list_add(&sko->sko_list, &sks->sks_free_list);
1314
1315 sks->sks_age = jiffies;
1316 sks->sks_ref--;
1317 skc->skc_obj_alloc--;
1318
1319 /*
1320 * Move slab to skc_partial_list when no longer full. Slabs
1321 * are added to the head to keep the partial list in quasi-full
1322 * sorted order. Fuller at the head, emptier at the tail.
1323 */
1324 if (sks->sks_ref == (sks->sks_objs - 1)) {
1325 list_del(&sks->sks_list);
1326 list_add(&sks->sks_list, &skc->skc_partial_list);
1327 }
1328
1329 /*
1330 * Move empty slabs to the end of the partial list so
1331 * they can be easily found and freed during reclamation.
1332 */
1333 if (sks->sks_ref == 0) {
1334 list_del(&sks->sks_list);
1335 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1336 skc->skc_slab_alloc--;
1337 }
1338 }
1339
1340 /*
1341 * Allocate an object from the per-cpu magazine, or if the magazine
1342 * is empty directly allocate from a slab and repopulate the magazine.
1343 */
1344 void *
1345 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1346 {
1347 spl_kmem_magazine_t *skm;
1348 void *obj = NULL;
1349
1350 ASSERT(skc->skc_magic == SKC_MAGIC);
1351 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1352 ASSERT(flags & KM_SLEEP);
1353
1354 atomic_inc(&skc->skc_ref);
1355
1356 /*
1357 * Allocate directly from a Linux slab. All optimizations are left
1358 * to the underlying cache; we only need to guarantee that KM_SLEEP
1359 * callers will never fail.
1360 */
1361 if (skc->skc_flags & KMC_SLAB) {
1362 struct kmem_cache *slc = skc->skc_linux_cache;
1363
1364 do {
1365 obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
1366 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1367
1368 goto ret;
1369 }
1370
1371 local_irq_disable();
1372
1373 restart:
1374 /*
1375 * Safe to update per-cpu structure without lock, but
1376 * in the restart case we must be careful to reacquire
1377 * the local magazine since this may have changed
1378 * when we need to grow the cache.
1379 */
1380 skm = skc->skc_mag[smp_processor_id()];
1381 ASSERT(skm->skm_magic == SKM_MAGIC);
1382
1383 if (likely(skm->skm_avail)) {
1384 /* Object available in CPU cache, use it */
1385 obj = skm->skm_objs[--skm->skm_avail];
1386 skm->skm_age = jiffies;
1387 } else {
1388 obj = spl_cache_refill(skc, skm, flags);
1389 if (obj == NULL)
1390 goto restart;
1391 }
1392
1393 local_irq_enable();
1394 ASSERT(obj);
1395 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1396
1397 ret:
1398 /* Pre-emptively migrate object to CPU L1 cache */
1399 if (obj) {
1400 if (obj && skc->skc_ctor)
1401 skc->skc_ctor(obj, skc->skc_private, flags);
1402 else
1403 prefetchw(obj);
1404 }
1405
1406 atomic_dec(&skc->skc_ref);
1407
1408 return (obj);
1409 }
1410
1411 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1412
1413 /*
1414 * Free an object back to the local per-cpu magazine; there is no
1415 * guarantee that this is the same magazine the object was originally
1416 * allocated from. We may need to flush entries from the magazine
1417 * back to the slabs to make space.
1418 */
1419 void
1420 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1421 {
1422 spl_kmem_magazine_t *skm;
1423 unsigned long flags;
1424
1425 ASSERT(skc->skc_magic == SKC_MAGIC);
1426 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1427 atomic_inc(&skc->skc_ref);
1428
1429 /*
1430 * Run the destructor
1431 */
1432 if (skc->skc_dtor)
1433 skc->skc_dtor(obj, skc->skc_private);
1434
1435 /*
1436 * Free the object back to the underlying Linux slab.
1437 */
1438 if (skc->skc_flags & KMC_SLAB) {
1439 kmem_cache_free(skc->skc_linux_cache, obj);
1440 goto out;
1441 }
1442
1443 /*
1444 * Only virtual slabs may have emergency objects and these objects
1445 * are guaranteed to have physical addresses. They must be removed
1446 * from the tree of emergency objects and then freed.
1447 */
1448 if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
1449 spl_emergency_free(skc, obj);
1450 goto out;
1451 }
1452
1453 local_irq_save(flags);
1454
1455 /*
1456 * Safe to update the per-cpu structure without a lock, but
1457 * since no remote memory allocation tracking is being performed
1458 * it is entirely possible to allocate an object from one
1459 * CPU cache and return it to another.
1460 */
1461 skm = skc->skc_mag[smp_processor_id()];
1462 ASSERT(skm->skm_magic == SKM_MAGIC);
1463
1464 /* Per-CPU cache full, flush it to make space */
1465 if (unlikely(skm->skm_avail >= skm->skm_size))
1466 spl_cache_flush(skc, skm, skm->skm_refill);
1467
1468 /* Available space in cache, use it */
1469 skm->skm_objs[skm->skm_avail++] = obj;
1470
1471 local_irq_restore(flags);
1472 out:
1473 atomic_dec(&skc->skc_ref);
1474 }
1475 EXPORT_SYMBOL(spl_kmem_cache_free);
1476
1477 /*
1478 * The generic shrinker function for all caches. Under Linux a shrinker
1479 * may not be tightly coupled with a slab cache. In fact Linux always
1480 * systematically tries calling all registered shrinker callbacks which
1481 * report that they contain unused objects. Because of this we only
1482 * register one shrinker function in the shim layer for all slab caches.
1483 * We always attempt to shrink all caches when this generic shrinker
1484 * is called.
1485 *
1486 * If sc->nr_to_scan is zero, the caller is requesting a query of the
1487 * number of objects which can potentially be freed. If it is nonzero,
1488 * the request is to free that many objects.
1489 *
1490 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
1491 * in struct shrinker and also require the shrinker to return the number
1492 * of objects freed.
1493 *
1494 * Older kernels require the shrinker to return the number of freeable
1495 * objects following the freeing of nr_to_scan objects.
1496 *
1497 * Linux semantics differ from those under Solaris, which are to
1498 * free all available objects which may (and probably will) be more
1499 * objects than the requested nr_to_scan.
1500 */
1501 static spl_shrinker_t
1502 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
1503 struct shrink_control *sc)
1504 {
1505 spl_kmem_cache_t *skc;
1506 int alloc = 0;
1507
1508 down_read(&spl_kmem_cache_sem);
1509 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1510 if (sc->nr_to_scan) {
1511 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
1512 uint64_t oldalloc = skc->skc_obj_alloc;
1513 spl_kmem_cache_reap_now(skc,
1514 MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
1515 if (oldalloc > skc->skc_obj_alloc)
1516 alloc += oldalloc - skc->skc_obj_alloc;
1517 #else
1518 spl_kmem_cache_reap_now(skc,
1519 MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
1520 alloc += skc->skc_obj_alloc;
1521 #endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
1522 } else {
1523 /* Request to query number of freeable objects */
1524 alloc += skc->skc_obj_alloc;
1525 }
1526 }
1527 up_read(&spl_kmem_cache_sem);
1528
1529 /*
1530 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
1531 * This functionality only exists to work around a rare issue where
1532 * shrink_slabs() is repeatedly invoked by many cores causing the
1533 * system to thrash.
1534 */
1535 if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
1536 return (SHRINK_STOP);
1537
1538 return (MAX(alloc, 0));
1539 }
1540
1541 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
1542
1543 /*
1544 * Call the registered reclaim function for a cache. Depending on how
1545 * many and which objects are released it may simply repopulate the
1546 * local magazine which will then need to age out. Objects which cannot
1547 * fit in the magazine will be released back to their slabs, which will
1548 * also need to age out before being released. This is all just best
1549 * effort and we do not want to thrash creating and destroying slabs.
1550 */
1551 void
1552 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
1553 {
1554 ASSERT(skc->skc_magic == SKC_MAGIC);
1555 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1556
1557 atomic_inc(&skc->skc_ref);
1558
1559 /*
1560 * Execute the registered reclaim callback if it exists. The
1561 * per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
1562 */
1563 if (skc->skc_flags & KMC_SLAB) {
1564 if (skc->skc_reclaim)
1565 skc->skc_reclaim(skc->skc_private);
1566
1567 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
1568 kmem_cache_shrink(skc->skc_linux_cache);
1569
1570 goto out;
1571 }
1572
1573 /*
1574 * Prevent concurrent cache reaping when contended.
1575 */
1576 if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
1577 goto out;
1578
1579 /*
1580 * When a reclaim function is available it may be invoked repeatedly
1581 * until at least a single slab can be freed. This ensures that we
1582 * do free memory back to the system. This helps minimize the chance
1583 * of an OOM event when the bulk of memory is used by the slab.
1584 *
1585 * When free slabs are already available the reclaim callback will be
1586 * skipped. Additionally, if no forward progress is detected despite
1587 * a reclaim function the cache will be skipped to avoid deadlock.
1588 *
1589 * Longer term this would be the correct place to add the code which
1590 * repacks the slabs in order to minimize fragmentation.
1591 */
1592 if (skc->skc_reclaim) {
1593 uint64_t objects = UINT64_MAX;
1594 int do_reclaim;
1595
1596 do {
1597 spin_lock(&skc->skc_lock);
1598 do_reclaim =
1599 (skc->skc_slab_total > 0) &&
1600 ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
1601 (skc->skc_obj_alloc < objects);
1602
1603 objects = skc->skc_obj_alloc;
1604 spin_unlock(&skc->skc_lock);
1605
1606 if (do_reclaim)
1607 skc->skc_reclaim(skc->skc_private);
1608
1609 } while (do_reclaim);
1610 }
1611
1612 /* Reclaim from the magazine then the slabs ignoring age and delay. */
1613 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
1614 spl_kmem_magazine_t *skm;
1615 unsigned long irq_flags;
1616
1617 local_irq_save(irq_flags);
1618 skm = skc->skc_mag[smp_processor_id()];
1619 spl_cache_flush(skc, skm, skm->skm_avail);
1620 local_irq_restore(irq_flags);
1621 }
1622
1623 spl_slab_reclaim(skc, count, 1);
1624 clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1625 smp_wmb();
1626 wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
1627 out:
1628 atomic_dec(&skc->skc_ref);
1629 }
1630 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1631
1632 /*
1633 * Reap all free slabs from all registered caches.
1634 */
1635 void
1636 spl_kmem_reap(void)
1637 {
1638 struct shrink_control sc;
1639
1640 sc.nr_to_scan = KMC_REAP_CHUNK;
1641 sc.gfp_mask = GFP_KERNEL;
1642
1643 (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
1644 }
1645 EXPORT_SYMBOL(spl_kmem_reap);
1646
1647 int
1648 spl_kmem_cache_init(void)
1649 {
1650 init_rwsem(&spl_kmem_cache_sem);
1651 INIT_LIST_HEAD(&spl_kmem_cache_list);
1652 spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
1653 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
1654 spl_register_shrinker(&spl_kmem_cache_shrinker);
1655
1656 return (0);
1657 }
1658
1659 void
1660 spl_kmem_cache_fini(void)
1661 {
1662 spl_unregister_shrinker(&spl_kmem_cache_shrinker);
1663 taskq_destroy(spl_kmem_cache_taskq);
1664 }