715f6251 1/*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
f4b37741 27#include <sys/kmem.h>
f1ca4da6 28
937879f1 29#ifdef DEBUG_SUBSYSTEM
30#undef DEBUG_SUBSYSTEM
31#endif
32
33#define DEBUG_SUBSYSTEM S_KMEM
34
f1ca4da6 35/*
2fb9b26a 36 * Memory allocation interfaces and debugging for basic kmem_*
37 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
38 * all allocations will be tracked when they are allocated and
39 * freed. When the SPL module is unloaded a list of all leaked
40 * addresses and where they were allocated will be dumped to the
41 * console. Enabling this feature has a significant impact on
42 * performance, but it makes finding memory leaks straightforward.
f1ca4da6 43 */
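/*
 * For reference, a minimal sketch of how the accounting counters below
 * are intended to be driven by the kmem_alloc()/kmem_free() shims (the
 * real macros live in the sys/kmem.h headers and may differ in detail):
 *
 *   atomic64_add(size, &kmem_alloc_used);
 *   if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
 *           kmem_alloc_max = atomic64_read(&kmem_alloc_used);
 *   ...
 *   atomic64_sub(size, &kmem_alloc_used);     (on the matching free)
 */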
44#ifdef DEBUG_KMEM
45/* Shim layer memory accounting */
c19c06f3 46atomic64_t kmem_alloc_used;
47unsigned long kmem_alloc_max = 0;
48atomic64_t vmem_alloc_used;
49unsigned long vmem_alloc_max = 0;
50int kmem_warning_flag = 1;
79b31f36 51
ff449ac4 52EXPORT_SYMBOL(kmem_alloc_used);
53EXPORT_SYMBOL(kmem_alloc_max);
54EXPORT_SYMBOL(vmem_alloc_used);
55EXPORT_SYMBOL(vmem_alloc_max);
56EXPORT_SYMBOL(kmem_warning_flag);
57
58#ifdef DEBUG_KMEM_TRACKING
d6a26c6a 59spinlock_t kmem_lock;
60struct hlist_head kmem_table[KMEM_TABLE_SIZE];
61struct list_head kmem_list;
62
13cdca65 63spinlock_t vmem_lock;
64struct hlist_head vmem_table[VMEM_TABLE_SIZE];
65struct list_head vmem_list;
66
d6a26c6a 67EXPORT_SYMBOL(kmem_lock);
68EXPORT_SYMBOL(kmem_table);
69EXPORT_SYMBOL(kmem_list);
70
13cdca65 71EXPORT_SYMBOL(vmem_lock);
72EXPORT_SYMBOL(vmem_table);
73EXPORT_SYMBOL(vmem_list);
ff449ac4 74#endif
13cdca65 75
c19c06f3 76int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
77#else
78int kmem_set_warning(int flag) { return 0; }
f1ca4da6 79#endif
c19c06f3 80EXPORT_SYMBOL(kmem_set_warning);
f1ca4da6 81
82/*
83 * Slab allocation interfaces
84 *
2fb9b26a 85 * While the Linux slab implementation was inspired by the Solaris
86 * implementation, I cannot use it to emulate the Solaris APIs. I
87 * require two features which are not provided by the Linux slab.
88 *
89 * 1) Constructors AND destructors. Recent versions of the Linux
90 * kernel have removed support for destructors. This is a deal
91 * breaker for the SPL which contains particularly expensive
92 * initializers for mutexes, condition variables, etc. We also
93 * require a minimal level of cleanup for these data types, unlike
94 * many Linux data types which do not need to be explicitly destroyed.
95 *
96 * 2) Virtual address backed slab. Callers of the Solaris slab
97 * expect it to work well for both small and very large allocations.
98 * Because of memory fragmentation the Linux slab which is backed
99 * by kmalloc'ed memory performs very badly when confronted with
100 * large numbers of large allocations. Basing the slab on the
101 * virtual address space removes the need for contiguous pages
102 * and greatly improves performance for large allocations.
103 *
104 * For these reasons, the SPL has its own slab implementation with
105 * the needed features. It is not as highly optimized as either the
106 * Solaris or Linux slabs, but it should get me most of what is
107 * needed until it can be optimized or obsoleted by another approach.
108 *
109 * One serious concern I do have about this method is the relatively
110 * small virtual address space on 32bit arches. This will seriously
111 * constrain the size of the slab caches and their performance.
112 *
2fb9b26a 113 * XXX: Implement work requests to keep an eye on each cache and
4afaaefa 114 * shrink them via spl_slab_reclaim() when they are wasting lots
2fb9b26a 115 * of space. Currently this process is driven by the reapers.
116 *
2fb9b26a 117 * XXX: Improve the partial slab list by carefully maintaining a
118 * strict ordering of fullest to emptiest slabs based on
119 * the slab reference count. This guarantees that when freeing
120 * slabs back to the system we need only linearly traverse the
121 * last N slabs in the list to discover all the freeable slabs.
122 *
123 * XXX: NUMA awareness for optionally allocating memory close to a
124 * particular core. This can be advantageous if you know the slab
125 * object will be short lived and primarily accessed from one core.
126 *
127 * XXX: Slab coloring may also yield performance improvements and would
128 * be desirable to implement.
4afaaefa 129 *
130 * XXX: Proper hardware cache alignment would be good too.
f1ca4da6 131 */
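/*
 * For reference, typical consumer usage of this interface looks roughly
 * like the sketch below (illustrative only; my_obj_t, my_ctor() and
 * my_dtor() are hypothetical caller-provided names, not part of the SPL):
 *
 *   static void my_ctor(void *obj, void *priv, int flags) { ... }
 *   static void my_dtor(void *obj, void *priv) { ... }
 *
 *   spl_kmem_cache_t *skc;
 *   void *obj;
 *
 *   skc = spl_kmem_cache_create("my_cache", sizeof(my_obj_t), 0,
 *                               my_ctor, my_dtor, NULL, NULL, NULL, 0);
 *   obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *   ...
 *   spl_kmem_cache_free(skc, obj);
 *   spl_kmem_cache_destroy(skc);
 */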
2fb9b26a 132
ff449ac4 133struct list_head spl_kmem_cache_list; /* List of caches */
134struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
c30df9c8 135
4afaaefa 136static int spl_cache_flush(spl_kmem_cache_t *skc,
137 spl_kmem_magazine_t *skm, int flush);
138
57d86234 139#ifdef HAVE_SET_SHRINKER
2fb9b26a 140static struct shrinker *spl_kmem_cache_shrinker;
57d86234 141#else
4afaaefa 142static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
143 unsigned int gfp_mask);
2fb9b26a 144static struct shrinker spl_kmem_cache_shrinker = {
4afaaefa 145 .shrink = spl_kmem_cache_generic_shrinker,
57d86234 146 .seeks = KMC_DEFAULT_SEEKS,
147};
148#endif
f1ca4da6 149
a1502d76 150static void *
151kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
fece7c99 152{
a1502d76 153 void *ptr;
f1ca4da6 154
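	/* For kmem backed caches anything larger than two pages is taken
	 * straight from __get_free_pages(); presumably this sidesteps the
	 * cost (and DEBUG_KMEM warnings) of large kmem_alloc() requests,
	 * while vmem backed caches always use vmem_alloc(). */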
a1502d76 155 if (skc->skc_flags & KMC_KMEM) {
156 if (size > (2 * PAGE_SIZE)) {
157 ptr = (void *)__get_free_pages(flags, get_order(size));
158 } else
159 ptr = kmem_alloc(size, flags);
160 } else {
161 ptr = vmem_alloc(size, flags);
d6a26c6a 162 }
fece7c99 163
a1502d76 164 return ptr;
165}
fece7c99 166
a1502d76 167static void
168kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
169{
170 if (skc->skc_flags & KMC_KMEM) {
171 if (size > (2 * PAGE_SIZE))
172 free_pages((unsigned long)ptr, get_order(size));
173 else
174 kmem_free(ptr, size);
175 } else {
176 vmem_free(ptr, size);
177 }
fece7c99 178}
179
180static spl_kmem_slab_t *
a1502d76 181spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
fece7c99 182{
183 spl_kmem_slab_t *sks;
a1502d76 184 spl_kmem_obj_t *sko, *n;
185 void *base, *obj;
186 int i, size, rc = 0;
187
188 /* It's important that we pack the spl_kmem_obj_t structure
189 * and the actual objects into one large address space
190 * to minimize the number of calls to the allocator. It
191 * is far better to do a few large allocations and then
192 * subdivide it ourselves. Now which allocator we use
193 * requires balancing a few trade-offs.
194 *
195 * For small objects we use kmem_alloc() because as long
196 * as you are only requesting a small number of pages
197 * (ideally just one) it's cheap. However, when you start
198 * requesting multiple pages kmem_alloc() gets increasingly
199 * expensive since it requires contiguous pages. For this
200 * reason we shift to vmem_alloc() for slabs of large
201 * objects which removes the need for contiguous pages.
202 * We do not use vmem_alloc() in all cases because there
203 * is significant locking overhead in __get_vm_area_node().
204 * This function takes a single global lock when acquiring
205 * an available virtual address range which serializes all
206 * vmem_alloc()'s for all slab caches. Using slightly
207 * different allocation functions for small and large
208 * objects should give us the best of both worlds.
fece7c99 209 *
a1502d76 210 * sks struct: sizeof(spl_kmem_slab_t)
211 * obj data: skc->skc_obj_size
212 * obj struct: sizeof(spl_kmem_obj_t)
213 * <N obj data + obj structs>
fece7c99 214 *
215 * XXX: It would probably be a good idea to more carefully
a1502d76 216 * align these data structures in memory.
fece7c99 217 */
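	/* For example (illustrative numbers only), in the non-KMC_OFFSLAB
	 * case a slab sized for 32 objects of 1024 bytes occupies
	 * sizeof(spl_kmem_slab_t) + 32 * (1024 + sizeof(spl_kmem_obj_t))
	 * bytes, the i-th object lives at base + sizeof(spl_kmem_slab_t) +
	 * i * (1024 + sizeof(spl_kmem_obj_t)), and its spl_kmem_obj_t
	 * immediately follows the object data at obj + skc->skc_obj_size. */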
a1502d76 218 base = kv_alloc(skc, skc->skc_slab_size, flags);
219 if (base == NULL)
fece7c99 220 RETURN(NULL);
221
a1502d76 222 sks = (spl_kmem_slab_t *)base;
223 sks->sks_magic = SKS_MAGIC;
224 sks->sks_objs = skc->skc_slab_objs;
225 sks->sks_age = jiffies;
226 sks->sks_cache = skc;
227 INIT_LIST_HEAD(&sks->sks_list);
228 INIT_LIST_HEAD(&sks->sks_free_list);
229 sks->sks_ref = 0;
230 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
fece7c99 231
232 for (i = 0; i < sks->sks_objs; i++) {
a1502d76 233 if (skc->skc_flags & KMC_OFFSLAB) {
234 obj = kv_alloc(skc, size, flags);
235 if (!obj)
236 GOTO(out, rc = -ENOMEM);
237 } else {
238 obj = base + sizeof(spl_kmem_slab_t) + i * size;
239 }
240
241 sko = obj + skc->skc_obj_size;
fece7c99 242 sko->sko_addr = obj;
243 sko->sko_magic = SKO_MAGIC;
244 sko->sko_slab = sks;
245 INIT_LIST_HEAD(&sko->sko_list);
fece7c99 246 list_add_tail(&sko->sko_list, &sks->sks_free_list);
247 }
248
fece7c99 249 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
250 if (skc->skc_ctor)
251 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
2fb9b26a 252out:
a1502d76 253 if (rc) {
254 if (skc->skc_flags & KMC_OFFSLAB)
255 list_for_each_entry_safe(sko,n,&sks->sks_free_list,sko_list)
256 kv_free(skc, sko->sko_addr, size);
fece7c99 257
a1502d76 258 kv_free(skc, base, skc->skc_slab_size);
259 sks = NULL;
fece7c99 260 }
261
a1502d76 262 RETURN(sks);
fece7c99 263}
264
2fb9b26a 265/* Removes slab from complete or partial list, so it must
d46630e0 266 * be called with the 'skc->skc_lock' held.
fece7c99 267 */
f1ca4da6 268static void
4afaaefa 269spl_slab_free(spl_kmem_slab_t *sks) {
2fb9b26a 270 spl_kmem_cache_t *skc;
271 spl_kmem_obj_t *sko, *n;
a1502d76 272 int size;
2fb9b26a 273 ENTRY;
57d86234 274
2fb9b26a 275 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 276 ASSERT(sks->sks_ref == 0);
d6a26c6a 277
fece7c99 278 skc = sks->sks_cache;
279 ASSERT(skc->skc_magic == SKC_MAGIC);
d46630e0 280 ASSERT(spin_is_locked(&skc->skc_lock));
f1ca4da6 281
fece7c99 282 skc->skc_obj_total -= sks->sks_objs;
283 skc->skc_slab_total--;
284 list_del(&sks->sks_list);
a1502d76 285 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
937879f1 286
fece7c99 287 /* Run destructors as the slab is being released */
a1502d76 288 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
289 ASSERT(sko->sko_magic == SKO_MAGIC);
290
2fb9b26a 291 if (skc->skc_dtor)
292 skc->skc_dtor(sko->sko_addr, skc->skc_private);
0a6fd143 293
a1502d76 294 if (skc->skc_flags & KMC_OFFSLAB)
295 kv_free(skc, sko->sko_addr, size);
296 }
d61e12af 297
a1502d76 298 kv_free(skc, sks, skc->skc_slab_size);
2fb9b26a 299 EXIT;
300}
d6a26c6a 301
2fb9b26a 302static int
4afaaefa 303__spl_slab_reclaim(spl_kmem_cache_t *skc)
2fb9b26a 304{
305 spl_kmem_slab_t *sks, *m;
306 int rc = 0;
307 ENTRY;
308
d46630e0 309 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 310 /*
311 * Free empty slabs which have not been touched in skc_delay
312 * seconds. This delay time is important to avoid thrashing.
313 * Empty slabs will be at the end of the skc_partial_list.
314 */
315 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
316 sks_list) {
4afaaefa 317 if (sks->sks_ref > 0)
2fb9b26a 318 break;
319
320 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
4afaaefa 321 spl_slab_free(sks);
2fb9b26a 322 rc++;
323 }
324 }
325
326 /* Returns number of slabs reclaimed */
327 RETURN(rc);
f1ca4da6 328}
329
2fb9b26a 330static int
4afaaefa 331spl_slab_reclaim(spl_kmem_cache_t *skc)
f1ca4da6 332{
2fb9b26a 333 int rc;
334 ENTRY;
f1ca4da6 335
d46630e0 336 spin_lock(&skc->skc_lock);
4afaaefa 337 rc = __spl_slab_reclaim(skc);
d46630e0 338 spin_unlock(&skc->skc_lock);
4efd4118 339
2fb9b26a 340 RETURN(rc);
341}
f1ca4da6 342
4afaaefa 343static int
344spl_magazine_size(spl_kmem_cache_t *skc)
345{
346 int size;
347 ENTRY;
348
349 /* Guesses for reasonable magazine sizes; they
350 * should really adapt based on observed usage. */
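	/* For example, with 4 KiB pages: objects up to 1 KiB (PAGE_SIZE/4)
	 * get a 512 entry per-CPU magazine, a 2 KiB object gets 128
	 * entries, and objects over 1 MiB (256 * PAGE_SIZE) get only 4. */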
351 if (skc->skc_obj_size > (PAGE_SIZE * 256))
4afaaefa 352 size = 4;
ff449ac4 353 else if (skc->skc_obj_size > (PAGE_SIZE * 32))
4afaaefa 354 size = 16;
ff449ac4 355 else if (skc->skc_obj_size > (PAGE_SIZE))
356 size = 64;
4afaaefa 357 else if (skc->skc_obj_size > (PAGE_SIZE / 4))
ff449ac4 358 size = 128;
4afaaefa 359 else
ff449ac4 360 size = 512;
4afaaefa 361
362 RETURN(size);
363}
364
365static spl_kmem_magazine_t *
366spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
367{
368 spl_kmem_magazine_t *skm;
369 int size = sizeof(spl_kmem_magazine_t) +
370 sizeof(void *) * skc->skc_mag_size;
371 ENTRY;
372
3d061e9d 373 skm = kmem_alloc_node(size, GFP_KERNEL, node);
4afaaefa 374 if (skm) {
375 skm->skm_magic = SKM_MAGIC;
376 skm->skm_avail = 0;
377 skm->skm_size = skc->skc_mag_size;
378 skm->skm_refill = skc->skc_mag_refill;
a1502d76 379 if (!(skc->skc_flags & KMC_NOTOUCH))
380 skm->skm_age = jiffies;
4afaaefa 381 }
382
383 RETURN(skm);
384}
385
386static void
387spl_magazine_free(spl_kmem_magazine_t *skm)
388{
389 ENTRY;
390 ASSERT(skm->skm_magic == SKM_MAGIC);
391 ASSERT(skm->skm_avail == 0);
392 kfree(skm);
393 EXIT;
394}
395
396static int
397spl_magazine_create(spl_kmem_cache_t *skc)
398{
399 int i;
400 ENTRY;
401
402 skc->skc_mag_size = spl_magazine_size(skc);
403 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
404
405 for_each_online_cpu(i) {
406 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
407 if (!skc->skc_mag[i]) {
408 for (i--; i >= 0; i--)
409 spl_magazine_free(skc->skc_mag[i]);
410
411 RETURN(-ENOMEM);
412 }
413 }
414
415 RETURN(0);
416}
417
418static void
419spl_magazine_destroy(spl_kmem_cache_t *skc)
420{
421 spl_kmem_magazine_t *skm;
422 int i;
423 ENTRY;
424
425 for_each_online_cpu(i) {
426 skm = skc->skc_mag[i];
427 (void)spl_cache_flush(skc, skm, skm->skm_avail);
428 spl_magazine_free(skm);
429 }
430
431 EXIT;
432}
433
2fb9b26a 434spl_kmem_cache_t *
435spl_kmem_cache_create(char *name, size_t size, size_t align,
436 spl_kmem_ctor_t ctor,
437 spl_kmem_dtor_t dtor,
438 spl_kmem_reclaim_t reclaim,
439 void *priv, void *vmp, int flags)
440{
441 spl_kmem_cache_t *skc;
a1502d76 442 uint32_t slab_max, slab_size, slab_objs;
443 int rc, kmem_flags = KM_SLEEP;
2fb9b26a 444 ENTRY;
937879f1 445
a1502d76 446 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
447 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
448 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
449
2fb9b26a 450 /* We may be called when there is a non-zero preempt_count or
451 * interrupts are disabled, in which case we must not sleep.
452 */
e9d7a2be 453 if (current_thread_info()->preempt_count || irqs_disabled())
2fb9b26a 454 kmem_flags = KM_NOSLEEP;
0a6fd143 455
2fb9b26a 456 /* Allocate new cache memory and initialize. */
ff449ac4 457 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
e9d7a2be 458 if (skc == NULL)
2fb9b26a 459 RETURN(NULL);
d61e12af 460
2fb9b26a 461 skc->skc_magic = SKC_MAGIC;
2fb9b26a 462 skc->skc_name_size = strlen(name) + 1;
463 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
464 if (skc->skc_name == NULL) {
465 kmem_free(skc, sizeof(*skc));
466 RETURN(NULL);
467 }
468 strncpy(skc->skc_name, name, skc->skc_name_size);
469
e9d7a2be 470 skc->skc_ctor = ctor;
471 skc->skc_dtor = dtor;
472 skc->skc_reclaim = reclaim;
2fb9b26a 473 skc->skc_private = priv;
474 skc->skc_vmp = vmp;
475 skc->skc_flags = flags;
476 skc->skc_obj_size = size;
2fb9b26a 477 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
478
2fb9b26a 479 INIT_LIST_HEAD(&skc->skc_list);
480 INIT_LIST_HEAD(&skc->skc_complete_list);
481 INIT_LIST_HEAD(&skc->skc_partial_list);
d46630e0 482 spin_lock_init(&skc->skc_lock);
e9d7a2be 483 skc->skc_slab_fail = 0;
484 skc->skc_slab_create = 0;
485 skc->skc_slab_destroy = 0;
2fb9b26a 486 skc->skc_slab_total = 0;
487 skc->skc_slab_alloc = 0;
488 skc->skc_slab_max = 0;
489 skc->skc_obj_total = 0;
490 skc->skc_obj_alloc = 0;
491 skc->skc_obj_max = 0;
a1502d76 492
493 /* If no type was passed, select a cache type based on object size */
494 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
495 if (skc->skc_obj_size < (PAGE_SIZE / 8)) {
496 skc->skc_flags |= KMC_KMEM;
497 } else {
498 skc->skc_flags |= KMC_VMEM;
499 }
500 }
501
502 /* Size slabs properly to ensure they are not too large */
503 slab_max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
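	/* e.g. with 4 KiB pages and the common MAX_ORDER of 11 this caps
	 * a single slab allocation at 4 MiB */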
504 if (skc->skc_flags & KMC_OFFSLAB) {
505 skc->skc_slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
506 skc->skc_slab_size = sizeof(spl_kmem_slab_t);
507 ASSERT(skc->skc_obj_size < slab_max);
508 } else {
509 slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB + 1;
510
511 do {
512 slab_objs--;
513 slab_size = sizeof(spl_kmem_slab_t) + slab_objs *
514 (skc->skc_obj_size+sizeof(spl_kmem_obj_t));
515 } while (slab_size > slab_max);
516
517 skc->skc_slab_objs = slab_objs;
518 skc->skc_slab_size = slab_size;
519 }
4afaaefa 520
521 rc = spl_magazine_create(skc);
522 if (rc) {
4afaaefa 523 kmem_free(skc->skc_name, skc->skc_name_size);
524 kmem_free(skc, sizeof(*skc));
525 RETURN(NULL);
526 }
2fb9b26a 527
528 down_write(&spl_kmem_cache_sem);
e9d7a2be 529 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
2fb9b26a 530 up_write(&spl_kmem_cache_sem);
531
e9d7a2be 532 RETURN(skc);
f1ca4da6 533}
2fb9b26a 534EXPORT_SYMBOL(spl_kmem_cache_create);
f1ca4da6 535
2fb9b26a 536void
537spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
f1ca4da6 538{
2fb9b26a 539 spl_kmem_slab_t *sks, *m;
540 ENTRY;
f1ca4da6 541
e9d7a2be 542 ASSERT(skc->skc_magic == SKC_MAGIC);
543
544 down_write(&spl_kmem_cache_sem);
545 list_del_init(&skc->skc_list);
546 up_write(&spl_kmem_cache_sem);
2fb9b26a 547
4afaaefa 548 spl_magazine_destroy(skc);
d46630e0 549 spin_lock(&skc->skc_lock);
d6a26c6a 550
2fb9b26a 551 /* Validate there are no objects in use and free all the
4afaaefa 552 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
2fb9b26a 553 ASSERT(list_empty(&skc->skc_complete_list));
a1502d76 554 ASSERT(skc->skc_slab_alloc == 0);
555 ASSERT(skc->skc_obj_alloc == 0);
d6a26c6a 556
e9d7a2be 557 list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
4afaaefa 558 spl_slab_free(sks);
2fb9b26a 559
a1502d76 560 ASSERT(skc->skc_slab_total == 0);
561 ASSERT(skc->skc_obj_total == 0);
562
2fb9b26a 563 kmem_free(skc->skc_name, skc->skc_name_size);
d46630e0 564 spin_unlock(&skc->skc_lock);
ff449ac4 565
4afaaefa 566 kmem_free(skc, sizeof(*skc));
2fb9b26a 567
568 EXIT;
f1ca4da6 569}
2fb9b26a 570EXPORT_SYMBOL(spl_kmem_cache_destroy);
f1ca4da6 571
4afaaefa 572static void *
573spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
f1ca4da6 574{
2fb9b26a 575 spl_kmem_obj_t *sko;
f1ca4da6 576
e9d7a2be 577 ASSERT(skc->skc_magic == SKC_MAGIC);
578 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 579 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 580
a1502d76 581 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
4afaaefa 582 ASSERT(sko->sko_magic == SKO_MAGIC);
583 ASSERT(sko->sko_addr != NULL);
2fb9b26a 584
a1502d76 585 /* Remove from sks_free_list */
4afaaefa 586 list_del_init(&sko->sko_list);
2fb9b26a 587
4afaaefa 588 sks->sks_age = jiffies;
589 sks->sks_ref++;
590 skc->skc_obj_alloc++;
2fb9b26a 591
4afaaefa 592 /* Track max obj usage statistics */
593 if (skc->skc_obj_alloc > skc->skc_obj_max)
594 skc->skc_obj_max = skc->skc_obj_alloc;
2fb9b26a 595
4afaaefa 596 /* Track max slab usage statistics */
597 if (sks->sks_ref == 1) {
598 skc->skc_slab_alloc++;
f1ca4da6 599
4afaaefa 600 if (skc->skc_slab_alloc > skc->skc_slab_max)
601 skc->skc_slab_max = skc->skc_slab_alloc;
2fb9b26a 602 }
603
4afaaefa 604 return sko->sko_addr;
605}
c30df9c8 606
4afaaefa 607/* No objects available, create a new slab. Since this is an
608 * expensive operation we do it without holding the spinlock
609 * and only briefly acquire it when we link in the fully
610 * allocated and constructed slab.
611 */
612static spl_kmem_slab_t *
613spl_cache_grow(spl_kmem_cache_t *skc, int flags)
614{
e9d7a2be 615 spl_kmem_slab_t *sks;
4afaaefa 616 ENTRY;
f1ca4da6 617
e9d7a2be 618 ASSERT(skc->skc_magic == SKC_MAGIC);
619
620 if (flags & __GFP_WAIT) {
fece7c99 621 flags |= __GFP_NOFAIL;
4afaaefa 622 local_irq_enable();
f78a933f 623 might_sleep();
4afaaefa 624 }
f1ca4da6 625
4afaaefa 626 sks = spl_slab_alloc(skc, flags);
627 if (sks == NULL) {
628 if (flags & __GFP_WAIT)
629 local_irq_disable();
630
631 RETURN(NULL);
632 }
2fb9b26a 633
e9d7a2be 634 if (flags & __GFP_WAIT)
4afaaefa 635 local_irq_disable();
636
637 /* Link the new empty slab onto the end of skc_partial_list */
d46630e0 638 spin_lock(&skc->skc_lock);
2fb9b26a 639 skc->skc_slab_total++;
640 skc->skc_obj_total += sks->sks_objs;
641 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
d46630e0 642 spin_unlock(&skc->skc_lock);
4afaaefa 643
644 RETURN(sks);
f1ca4da6 645}
646
4afaaefa 647static int
648spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
f1ca4da6 649{
e9d7a2be 650 spl_kmem_slab_t *sks;
651 int rc = 0, refill;
937879f1 652 ENTRY;
f1ca4da6 653
e9d7a2be 654 ASSERT(skc->skc_magic == SKC_MAGIC);
655 ASSERT(skm->skm_magic == SKM_MAGIC);
656
4afaaefa 657 /* XXX: Check for refill bouncing by age perhaps */
e9d7a2be 658 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
4afaaefa 659
d46630e0 660 spin_lock(&skc->skc_lock);
ff449ac4 661
4afaaefa 662 while (refill > 0) {
663 /* No slabs available, we must grow the cache */
664 if (list_empty(&skc->skc_partial_list)) {
665 spin_unlock(&skc->skc_lock);
ff449ac4 666
4afaaefa 667 sks = spl_cache_grow(skc, flags);
668 if (!sks)
e9d7a2be 669 GOTO(out, rc);
4afaaefa 670
671 /* Rescheduled to a different CPU, skm is not local */
672 if (skm != skc->skc_mag[smp_processor_id()])
e9d7a2be 673 GOTO(out, rc);
674
675 /* Potentially rescheduled to the same CPU, but
676 * allocations may have occurred from this CPU while
677 * we were sleeping, so recalculate max refill. */
678 refill = MIN(refill, skm->skm_size - skm->skm_avail);
4afaaefa 679
680 spin_lock(&skc->skc_lock);
681 continue;
682 }
d46630e0 683
4afaaefa 684 /* Grab the next available slab */
685 sks = list_entry((&skc->skc_partial_list)->next,
686 spl_kmem_slab_t, sks_list);
687 ASSERT(sks->sks_magic == SKS_MAGIC);
688 ASSERT(sks->sks_ref < sks->sks_objs);
689 ASSERT(!list_empty(&sks->sks_free_list));
d46630e0 690
4afaaefa 691 /* Consume as many objects as needed to refill the requested
e9d7a2be 692 * cache. We must also be careful not to overfill it. */
693 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
694 ASSERT(skm->skm_avail < skm->skm_size);
695 ASSERT(rc < skm->skm_size);
4afaaefa 696 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
e9d7a2be 697 }
f1ca4da6 698
4afaaefa 699 /* Move slab to skc_complete_list when full */
700 if (sks->sks_ref == sks->sks_objs) {
701 list_del(&sks->sks_list);
702 list_add(&sks->sks_list, &skc->skc_complete_list);
2fb9b26a 703 }
704 }
57d86234 705
4afaaefa 706 spin_unlock(&skc->skc_lock);
707out:
708 /* Returns the number of entries added to cache */
e9d7a2be 709 RETURN(rc);
4afaaefa 710}
711
712static void
713spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
714{
e9d7a2be 715 spl_kmem_slab_t *sks = NULL;
4afaaefa 716 spl_kmem_obj_t *sko = NULL;
717 ENTRY;
718
e9d7a2be 719 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 720 ASSERT(spin_is_locked(&skc->skc_lock));
721
a1502d76 722 sko = obj + skc->skc_obj_size;
723 ASSERT(sko->sko_magic == SKO_MAGIC);
4afaaefa 724
725 sks = sko->sko_slab;
a1502d76 726 ASSERT(sks->sks_magic == SKS_MAGIC);
2fb9b26a 727 ASSERT(sks->sks_cache == skc);
2fb9b26a 728 list_add(&sko->sko_list, &sks->sks_free_list);
d6a26c6a 729
2fb9b26a 730 sks->sks_age = jiffies;
4afaaefa 731 sks->sks_ref--;
2fb9b26a 732 skc->skc_obj_alloc--;
f1ca4da6 733
2fb9b26a 734 /* Move slab to skc_partial_list when no longer full. Slabs
4afaaefa 735 * are added to the head to keep the partial list in quasi-full
736 * sorted order. Fuller at the head, emptier at the tail. */
737 if (sks->sks_ref == (sks->sks_objs - 1)) {
2fb9b26a 738 list_del(&sks->sks_list);
739 list_add(&sks->sks_list, &skc->skc_partial_list);
740 }
f1ca4da6 741
2fb9b26a 742 /* Move empty slabs to the end of the partial list so
4afaaefa 743 * they can be easily found and freed during reclamation. */
744 if (sks->sks_ref == 0) {
2fb9b26a 745 list_del(&sks->sks_list);
746 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
747 skc->skc_slab_alloc--;
748 }
749
4afaaefa 750 EXIT;
751}
752
753static int
754spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
755{
756 int i, count = MIN(flush, skm->skm_avail);
757 ENTRY;
758
e9d7a2be 759 ASSERT(skc->skc_magic == SKC_MAGIC);
760 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 761
762 spin_lock(&skc->skc_lock);
ff449ac4 763
4afaaefa 764 for (i = 0; i < count; i++)
765 spl_cache_shrink(skc, skm->skm_objs[i]);
766
e9d7a2be 767// __spl_slab_reclaim(skc);
768 skm->skm_avail -= count;
769 memmove(skm->skm_objs, &(skm->skm_objs[count]),
4afaaefa 770 sizeof(void *) * skm->skm_avail);
771
d46630e0 772 spin_unlock(&skc->skc_lock);
4afaaefa 773
774 RETURN(count);
775}
776
777void *
778spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
779{
780 spl_kmem_magazine_t *skm;
781 unsigned long irq_flags;
782 void *obj = NULL;
e9d7a2be 783 int id;
4afaaefa 784 ENTRY;
785
e9d7a2be 786 ASSERT(skc->skc_magic == SKC_MAGIC);
787 ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
4afaaefa 788 local_irq_save(irq_flags);
789
790restart:
791 /* Safe to update per-cpu structure without lock, but
792 * in the restart case we must be careful to reacquire
793 * the local magazine since this may have changed
794 * when we need to grow the cache. */
e9d7a2be 795 id = smp_processor_id();
796 ASSERTF(id < 4, "cache=%p smp_processor_id=%d\n", skc, id);
4afaaefa 797 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 798 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
799 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
800 skm->skm_size, skm->skm_refill, skm->skm_avail);
4afaaefa 801
802 if (likely(skm->skm_avail)) {
803 /* Object available in CPU cache, use it */
804 obj = skm->skm_objs[--skm->skm_avail];
a1502d76 805 if (!(skc->skc_flags & KMC_NOTOUCH))
806 skm->skm_age = jiffies;
4afaaefa 807 } else {
808 /* Per-CPU cache empty, directly allocate from
809 * the slab and refill the per-CPU cache. */
810 (void)spl_cache_refill(skc, skm, flags);
811 GOTO(restart, obj = NULL);
812 }
813
814 local_irq_restore(irq_flags);
fece7c99 815 ASSERT(obj);
4afaaefa 816
817 /* Pre-emptively migrate object to CPU L1 cache */
818 prefetchw(obj);
819
820 RETURN(obj);
821}
822EXPORT_SYMBOL(spl_kmem_cache_alloc);
823
824void
825spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
826{
827 spl_kmem_magazine_t *skm;
828 unsigned long flags;
829 ENTRY;
830
e9d7a2be 831 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 832 local_irq_save(flags);
833
834 /* Safe to update per-cpu structure without lock, but since
835 * no remote memory allocation tracking is being performed
836 * it is entirely possible to allocate an object from one
837 * CPU cache and return it to another. */
838 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 839 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 840
841 /* Per-CPU cache full, flush it to make space */
842 if (unlikely(skm->skm_avail >= skm->skm_size))
843 (void)spl_cache_flush(skc, skm, skm->skm_refill);
844
845 /* Available space in cache, use it */
846 skm->skm_objs[skm->skm_avail++] = obj;
847
848 local_irq_restore(flags);
849
850 EXIT;
f1ca4da6 851}
2fb9b26a 852EXPORT_SYMBOL(spl_kmem_cache_free);
5c2bb9b2 853
2fb9b26a 854static int
4afaaefa 855spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
2fb9b26a 856{
e9d7a2be 857 spl_kmem_cache_t *skc;
5c2bb9b2 858
2fb9b26a 859 /* Under Linux a shrinker is not tightly coupled with a slab
860 * cache. In fact Linux always systematically tries calling all
861 * registered shrinker callbacks until its target reclamation level
862 * is reached. Because of this we only register one shrinker
863 * function in the shim layer for all slab caches. And we always
864 * attempt to shrink all caches when this generic shrinker is called.
c30df9c8 865 */
e9d7a2be 866 down_read(&spl_kmem_cache_sem);
57d86234 867
e9d7a2be 868 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
2fb9b26a 869 spl_kmem_cache_reap_now(skc);
870
e9d7a2be 871 up_read(&spl_kmem_cache_sem);
2fb9b26a 872
873 /* XXX: Under Linux a shrinker should return the remaining number of
874 * entries in the cache; we should do this as well.
875 */
876 return 1;
5c2bb9b2 877}
5c2bb9b2 878
57d86234 879void
2fb9b26a 880spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
57d86234 881{
4afaaefa 882 spl_kmem_magazine_t *skm;
883 int i;
2fb9b26a 884 ENTRY;
e9d7a2be 885
886 ASSERT(skc->skc_magic == SKC_MAGIC);
2fb9b26a 887
888 if (skc->skc_reclaim)
889 skc->skc_reclaim(skc->skc_private);
890
4afaaefa 891 /* Ensure idle per-CPU caches are gradually flushed */
892 for_each_online_cpu(i) {
893 skm = skc->skc_mag[i];
894
895 if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
896 (void)spl_cache_flush(skc, skm, skm->skm_refill);
897 }
898
899 spl_slab_reclaim(skc);
900
2fb9b26a 901 EXIT;
57d86234 902}
2fb9b26a 903EXPORT_SYMBOL(spl_kmem_cache_reap_now);
57d86234 904
f1b59d26 905void
2fb9b26a 906spl_kmem_reap(void)
937879f1 907{
4afaaefa 908 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
f1ca4da6 909}
2fb9b26a 910EXPORT_SYMBOL(spl_kmem_reap);
5d86345d 911
ff449ac4 912#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
c6dc93d6 913static char *
4afaaefa 914spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
d6a26c6a 915{
e9d7a2be 916 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
d6a26c6a 917 int i, flag = 1;
918
919 ASSERT(str != NULL && len >= 17);
e9d7a2be 920 memset(str, 0, len);
d6a26c6a 921
922 /* Check for a fully printable string, and while we are at
923 * it place the printable characters in the passed buffer. */
924 for (i = 0; i < size; i++) {
e9d7a2be 925 str[i] = ((char *)(kd->kd_addr))[i];
926 if (isprint(str[i])) {
927 continue;
928 } else {
929 /* Minimum number of printable characters found
930 * to make it worthwhile to print this as ascii. */
931 if (i > min)
932 break;
933
934 flag = 0;
935 break;
936 }
d6a26c6a 937 }
938
939 if (!flag) {
940 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
941 *((uint8_t *)kd->kd_addr),
942 *((uint8_t *)kd->kd_addr + 2),
943 *((uint8_t *)kd->kd_addr + 4),
944 *((uint8_t *)kd->kd_addr + 6),
945 *((uint8_t *)kd->kd_addr + 8),
946 *((uint8_t *)kd->kd_addr + 10),
947 *((uint8_t *)kd->kd_addr + 12),
948 *((uint8_t *)kd->kd_addr + 14));
949 }
950
951 return str;
952}
953
a1502d76 954static int
955spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
956{
957 int i;
958 ENTRY;
959
960 spin_lock_init(lock);
961 INIT_LIST_HEAD(list);
962
963 for (i = 0; i < size; i++)
964 INIT_HLIST_HEAD(&kmem_table[i]);
965
966 RETURN(0);
967}
968
ff449ac4 969static void
970spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
5d86345d 971{
2fb9b26a 972 unsigned long flags;
973 kmem_debug_t *kd;
974 char str[17];
a1502d76 975 ENTRY;
2fb9b26a 976
ff449ac4 977 spin_lock_irqsave(lock, flags);
978 if (!list_empty(list))
2fb9b26a 979 CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n",
980 "address", "size", "data", "func", "line");
981
ff449ac4 982 list_for_each_entry(kd, list, kd_list)
2fb9b26a 983 CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n",
984 kd->kd_addr, kd->kd_size,
4afaaefa 985 spl_sprintf_addr(kd, str, 17, 8),
2fb9b26a 986 kd->kd_func, kd->kd_line);
987
ff449ac4 988 spin_unlock_irqrestore(lock, flags);
a1502d76 989 EXIT;
ff449ac4 990}
991#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
a1502d76 992#define spl_kmem_init_tracking(list, lock, size)
ff449ac4 993#define spl_kmem_fini_tracking(list, lock)
994#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
995
a1502d76 996int
997spl_kmem_init(void)
998{
999 int rc = 0;
1000 ENTRY;
1001
1002 init_rwsem(&spl_kmem_cache_sem);
1003 INIT_LIST_HEAD(&spl_kmem_cache_list);
1004
1005#ifdef HAVE_SET_SHRINKER
1006 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1007 spl_kmem_cache_generic_shrinker);
1008 if (spl_kmem_cache_shrinker == NULL)
f78a933f 1009 RETURN(rc = -ENOMEM);
a1502d76 1010#else
1011 register_shrinker(&spl_kmem_cache_shrinker);
1012#endif
1013
1014#ifdef DEBUG_KMEM
1015 atomic64_set(&kmem_alloc_used, 0);
1016 atomic64_set(&vmem_alloc_used, 0);
1017
1018 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1019 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1020#endif
a1502d76 1021 RETURN(rc);
1022}
1023
ff449ac4 1024void
1025spl_kmem_fini(void)
1026{
1027#ifdef DEBUG_KMEM
1028 /* Display all unreclaimed memory addresses, including the
1029 * allocation size and the first few bytes of what's located
1030 * at that address to aid in debugging. Performance is not
1031 * a serious concern here since it is module unload time. */
1032 if (atomic64_read(&kmem_alloc_used) != 0)
1033 CWARN("kmem leaked %ld/%ld bytes\n",
1034 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
1035
2fb9b26a 1036
1037 if (atomic64_read(&vmem_alloc_used) != 0)
1038 CWARN("vmem leaked %ld/%ld bytes\n",
1039 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
1040
ff449ac4 1041 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1042 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1043#endif /* DEBUG_KMEM */
2fb9b26a 1044 ENTRY;
1045
1046#ifdef HAVE_SET_SHRINKER
1047 remove_shrinker(spl_kmem_cache_shrinker);
1048#else
1049 unregister_shrinker(&spl_kmem_cache_shrinker);
5d86345d 1050#endif
2fb9b26a 1051
937879f1 1052 EXIT;
5d86345d 1053}