1 /*****************************************************************************\
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
8 * This file is part of the SPL, Solaris Porting Layer.
9 * For details, see <http://github.com/behlendorf/spl/>.
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
21 * You should have received a copy of the GNU General Public License along
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 \*****************************************************************************/
28 #include <linux/module.h>
29 #include <linux/slab.h>
30 #include <linux/vmalloc.h>
31 #include <linux/mm_compat.h>
32 #include <linux/spinlock.h>
33 #include <linux/rwsem.h>
34 #include <linux/hash.h>
35 #include <linux/ctype.h>
36 #include <asm/atomic.h>
37 #include <sys/types.h>
38 #include <sys/vmsystm.h>
39 #include <sys/kstat.h>
/*
 * Memory allocation interfaces
 *
 * Solaris-style KM_* allocation flags expressed as Linux GFP masks.
 */
#define KM_SLEEP			GFP_KERNEL	/* Can sleep, never fails */
#define KM_NOSLEEP			GFP_ATOMIC	/* Can not sleep, may fail */
#define KM_PUSHPAGE			(GFP_NOIO | __GFP_HIGH)	/* Use reserved memory */
#define KM_NODEBUG			__GFP_NOWARN	/* Suppress warnings */
#define KM_FLAGS			__GFP_BITS_MASK
#define KM_VMFLAGS			GFP_LEVEL_MASK

/*
 * Used internally, the kernel does not need to support this flag.
 * The fallback value is only supplied when nothing has defined
 * __GFP_ZERO yet, so an existing definition is never overridden.
 */
#ifndef __GFP_ZERO
# define __GFP_ZERO			0x8000
#endif
59 * PF_NOFS is a per-process debug flag which is set in current->flags to
60 * detect when a process is performing an unsafe allocation. All tasks
61 * with PF_NOFS set must strictly use KM_PUSHPAGE for allocations because
62 * if they enter direct reclaim and initiate I/O they may deadlock.
64 * When debugging is disabled, any incorrect usage will be detected and
65 * a call stack with warning will be printed to the console. The flags
66 * will then be automatically corrected to allow for safe execution. If
67 * debugging is enabled this will be treated as a fatal condition.
69 * To avoid any risk of conflicting with the existing PF_ flags, the
70 * PF_NOFS bit shadows the rarely used PF_MUTEX_TESTER bit. Only when
71 * CONFIG_RT_MUTEX_TESTER is not set, and we know this bit is unused,
72 * will the PF_NOFS bit be valid. Happily, most existing distributions
73 * ship a kernel with CONFIG_RT_MUTEX_TESTER disabled.
#if !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER)
# define PF_NOFS			PF_MUTEX_TESTER

/*
 * Check the GFP flags of an allocation made by task 'p'.
 *
 * When the task has PF_NOFS set and the request still carries __GFP_IO
 * or __GFP_FS, the allocation is unsafe.  Depending on the build this
 * either logs a rate-limited warning with a stack dump and strips the
 * two bits from *flags, or panics.
 *
 * p     - task performing the allocation (callers pass 'current')
 * flags - in/out GFP mask; may be modified in place
 *
 * NOTE(review): the preprocessor lines selecting between the two branches
 * were not present in the reviewed listing; NDEBUG is assumed from the
 * "debugging disabled => warn and fix, enabled => fatal" description
 * that precedes this section -- confirm.
 */
static inline void
sanitize_flags(struct task_struct *p, gfp_t *flags)
{
	if (unlikely((p->flags & PF_NOFS) && (*flags & (__GFP_IO|__GFP_FS)))) {
# ifdef NDEBUG
		/* Print the mask itself (*flags), not the pointer to it. */
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "Fixing allocation for "
		    "task %s (%d) which used GFP flags 0x%x with PF_NOFS set\n",
		    p->comm, p->pid, *flags);
		spl_debug_dumpstack(p);
		*flags &= ~(__GFP_IO|__GFP_FS);
# else
		PANIC("FATAL allocation for task %s (%d) which used GFP "
		    "flags 0x%x with PF_NOFS set\n", p->comm, p->pid, *flags);
# endif /* NDEBUG */
	}
}
#else
/* No spare PF_ bit is available: the check compiles away entirely. */
# define PF_NOFS			0x00000000
# define sanitize_flags(p, fl)		((void)0)
#endif /* !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) */
100 * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
101 * early as 2.6.32. To avoid this issue when it occurs in upstream kernels
102 * we retry the allocation here as long as __GFP_WAIT is set (i.e. not GFP_ATOMIC).
103 * I would prefer the caller handle the failure case cleanly but we are
104 * trying to emulate Solaris and those are not the Solaris semantics.
107 kmalloc_nofail(size_t size
, gfp_t flags
)
111 sanitize_flags(current
, &flags
);
114 ptr
= kmalloc(size
, flags
);
115 } while (ptr
== NULL
&& (flags
& __GFP_WAIT
));
121 kzalloc_nofail(size_t size
, gfp_t flags
)
125 sanitize_flags(current
, &flags
);
128 ptr
= kzalloc(size
, flags
);
129 } while (ptr
== NULL
&& (flags
& __GFP_WAIT
));
135 kmalloc_node_nofail(size_t size
, gfp_t flags
, int node
)
137 #ifdef HAVE_KMALLOC_NODE
140 sanitize_flags(current
, &flags
);
143 ptr
= kmalloc_node(size
, flags
, node
);
144 } while (ptr
== NULL
&& (flags
& __GFP_WAIT
));
148 return kmalloc_nofail(size
, flags
);
149 #endif /* HAVE_KMALLOC_NODE */
153 vmalloc_nofail(size_t size
, gfp_t flags
)
157 sanitize_flags(current
, &flags
);
160 * Retry failed __vmalloc() allocations once every second. The
161 * rational for the delay is that the likely failure modes are:
163 * 1) The system has completely exhausted memory, in which case
164 * delaying 1 second for the memory reclaim to run is reasonable
165 * to avoid thrashing the system.
166 * 2) The system has memory but has exhausted the small virtual
167 * address space available on 32-bit systems. Retrying the
168 * allocation immediately will only result in spinning on the
169 * virtual address space lock. It is better delay a second and
170 * hope that another process will free some of the address space.
171 * But the bottom line is there is not much we can actually do
172 * since we can never safely return a failure and honor the
176 ptr
= __vmalloc(size
, flags
| __GFP_HIGHMEM
, PAGE_KERNEL
);
177 if (unlikely((ptr
== NULL
) && (flags
& __GFP_WAIT
))) {
178 set_current_state(TASK_INTERRUPTIBLE
);
179 schedule_timeout(HZ
);
189 vzalloc_nofail(size_t size
, gfp_t flags
)
193 ptr
= vmalloc_nofail(size
, flags
);
195 memset(ptr
, 0, (size
));
#ifdef DEBUG_KMEM
/*
 * Memory accounting functions to be used only when DEBUG_KMEM is set.
 * The "used" totals are atomic counters: atomic64_t when the kernel
 * provides it (HAVE_ATOMIC64_T), plain atomic_t otherwise.
 */
# ifdef HAVE_ATOMIC64_T

extern atomic64_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic64_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

#  define kmem_alloc_used_add(sz)	atomic64_add((sz), &kmem_alloc_used)
#  define kmem_alloc_used_sub(sz)	atomic64_sub((sz), &kmem_alloc_used)
#  define kmem_alloc_used_read()	atomic64_read(&kmem_alloc_used)
#  define kmem_alloc_used_set(sz)	atomic64_set(&kmem_alloc_used, (sz))
#  define vmem_alloc_used_add(sz)	atomic64_add((sz), &vmem_alloc_used)
#  define vmem_alloc_used_sub(sz)	atomic64_sub((sz), &vmem_alloc_used)
#  define vmem_alloc_used_read()	atomic64_read(&vmem_alloc_used)
#  define vmem_alloc_used_set(sz)	atomic64_set(&vmem_alloc_used, (sz))

# else /* HAVE_ATOMIC64_T */

extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

#  define kmem_alloc_used_add(sz)	atomic_add((sz), &kmem_alloc_used)
#  define kmem_alloc_used_sub(sz)	atomic_sub((sz), &kmem_alloc_used)
#  define kmem_alloc_used_read()	atomic_read(&kmem_alloc_used)
#  define kmem_alloc_used_set(sz)	atomic_set(&kmem_alloc_used, (sz))
#  define vmem_alloc_used_add(sz)	atomic_add((sz), &vmem_alloc_used)
#  define vmem_alloc_used_sub(sz)	atomic_sub((sz), &vmem_alloc_used)
#  define vmem_alloc_used_read()	atomic_read(&vmem_alloc_used)
#  define vmem_alloc_used_set(sz)	atomic_set(&vmem_alloc_used, (sz))

# endif /* HAVE_ATOMIC64_T */

# ifdef DEBUG_KMEM_TRACKING
/*
 * DEBUG_KMEM && DEBUG_KMEM_TRACKING
 *
 * The maximum level of memory debugging.  All memory will be accounted
 * for and each allocation will be explicitly tracked.  Any allocation
 * which is leaked will be reported on module unload, together with the
 * exact location where that memory was allocated.  This level of memory
 * tracking will have a significant impact on performance and should only
 * be enabled for debugging.  This feature may be enabled by passing
 * --enable-debug-kmem-tracking to configure.
 */
#  define kmem_alloc(sz, fl)						\
	kmem_alloc_track((sz), (fl), __func__, __LINE__, 0, 0)
#  define kmem_zalloc(sz, fl)						\
	kmem_alloc_track((sz), (fl)|__GFP_ZERO, __func__, __LINE__, 0, 0)
#  define kmem_alloc_node(sz, fl, nd)					\
	kmem_alloc_track((sz), (fl), __func__, __LINE__, 1, (nd))
#  define kmem_free(ptr, sz)		kmem_free_track((ptr), (sz))

#  define vmem_alloc(sz, fl)						\
	vmem_alloc_track((sz), (fl), __func__, __LINE__)
#  define vmem_zalloc(sz, fl)						\
	vmem_alloc_track((sz), (fl)|__GFP_ZERO, __func__, __LINE__)
#  define vmem_free(ptr, sz)		vmem_free_track((ptr), (sz))

extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
extern void kmem_free_track(const void *, size_t);
extern void *vmem_alloc_track(size_t, int, const char *, int);
extern void vmem_free_track(const void *, size_t);

# else /* DEBUG_KMEM_TRACKING */
/*
 * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
 *
 * The default build will set DEBUG_KMEM.  This provides basic memory
 * accounting with little to no impact on performance.  When the module
 * is unloaded, if any memory was leaked the total number of leaked bytes
 * will be reported on the console.  To disable this basic accounting
 * pass the --disable-debug-kmem option to configure.
 */
#  define kmem_alloc(sz, fl)						\
	kmem_alloc_debug((sz), (fl), __func__, __LINE__, 0, 0)
#  define kmem_zalloc(sz, fl)						\
	kmem_alloc_debug((sz), (fl)|__GFP_ZERO, __func__, __LINE__, 0, 0)
#  define kmem_alloc_node(sz, fl, nd)					\
	kmem_alloc_debug((sz), (fl), __func__, __LINE__, 1, (nd))
#  define kmem_free(ptr, sz)		kmem_free_debug((ptr), (sz))

#  define vmem_alloc(sz, fl)						\
	vmem_alloc_debug((sz), (fl), __func__, __LINE__)
#  define vmem_zalloc(sz, fl)						\
	vmem_alloc_debug((sz), (fl)|__GFP_ZERO, __func__, __LINE__)
#  define vmem_free(ptr, sz)		vmem_free_debug((ptr), (sz))

extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
extern void kmem_free_debug(const void *, size_t);
extern void *vmem_alloc_debug(size_t, int, const char *, int);
extern void vmem_free_debug(const void *, size_t);

# endif /* DEBUG_KMEM_TRACKING */
#else /* DEBUG_KMEM */
/*
 * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
 *
 * All debugging is disabled.  There will be no overhead even for
 * minimal memory accounting.  To enable basic accounting pass the
 * --enable-debug-kmem option to configure.
 */
# define kmem_alloc(sz, fl)		kmalloc_nofail((sz), (fl))
# define kmem_zalloc(sz, fl)		kzalloc_nofail((sz), (fl))
# define kmem_alloc_node(sz, fl, nd)	kmalloc_node_nofail((sz), (fl), (nd))
/* The size is evaluated and discarded; kfree()/vfree() do not need it. */
# define kmem_free(ptr, sz)		((void)(sz), kfree(ptr))

# define vmem_alloc(sz, fl)		vmalloc_nofail((sz), (fl))
# define vmem_zalloc(sz, fl)		vzalloc_nofail((sz), (fl))
# define vmem_free(ptr, sz)		((void)(sz), vfree(ptr))

#endif /* DEBUG_KMEM */
/*
 * Miscellaneous helpers implemented outside this header.
 *
 * kmem_vasprintf()/kmem_asprintf() take a printf-style format and return
 * a char *; strdup()/strfree() are declared as a pair.
 * NOTE(review): presumably the returned strings are kmem-allocated and
 * must be released with strfree() -- confirm against the implementation.
 */
int kmem_debugging(void);
char *kmem_vasprintf(const char *fmt, va_list ap);
char *kmem_asprintf(const char *fmt, ...);
char *strdup(const char *str);
void strfree(char *str);
327 * Slab allocation interfaces. The SPL slab differs from the standard
328 * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
329 * allocated from the physical or virtual memory address space. The virtual
330 * slabs allow for good behavior when allocating large objects of identical
331 * size. This slab implementation also supports both constructors and
332 * destructors which the Linux slab does not.
/* Bit positions used to build the KMC_* cache flag masks. */
enum {
	KMC_BIT_NOTOUCH		= 0,	/* Don't update ages */
	KMC_BIT_NODEBUG		= 1,	/* Default behavior */
	KMC_BIT_NOMAGAZINE	= 2,	/* XXX: Unsupported */
	KMC_BIT_NOHASH		= 3,	/* XXX: Unsupported */
	KMC_BIT_QCACHE		= 4,	/* XXX: Unsupported */
	KMC_BIT_KMEM		= 5,	/* Use kmem cache */
	KMC_BIT_VMEM		= 6,	/* Use vmem cache */
	KMC_BIT_OFFSLAB		= 7,	/* Objects not on slab */
	KMC_BIT_GROWING		= 15,	/* Growing in progress */
	KMC_BIT_REAPING		= 16,	/* Reaping in progress */
	KMC_BIT_DESTROY		= 17,	/* Destroy in progress */
	KMC_BIT_TOTAL		= 18,	/* Proc handler helper bit */
	KMC_BIT_ALLOC		= 19,	/* Proc handler helper bit */
	KMC_BIT_MAX		= 20,	/* Proc handler helper bit */
};
/* Return codes for the kmem "move" callback. */
typedef enum kmem_cbrc {
	KMEM_CBRC_YES		= 0,	/* Object moved */
	KMEM_CBRC_NO		= 1,	/* Object not moved */
	KMEM_CBRC_LATER		= 2,	/* Object not moved, try again later */
	KMEM_CBRC_DONT_NEED	= 3,	/* Neither object is needed */
	KMEM_CBRC_DONT_KNOW	= 4,	/* Object unknown */
} kmem_cbrc_t;
/* Cache flag masks, one per KMC_BIT_* position. */
#define KMC_NOTOUCH			(1 << KMC_BIT_NOTOUCH)
#define KMC_NODEBUG			(1 << KMC_BIT_NODEBUG)
#define KMC_NOMAGAZINE			(1 << KMC_BIT_NOMAGAZINE)
#define KMC_NOHASH			(1 << KMC_BIT_NOHASH)
#define KMC_QCACHE			(1 << KMC_BIT_QCACHE)
#define KMC_KMEM			(1 << KMC_BIT_KMEM)
#define KMC_VMEM			(1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB			(1 << KMC_BIT_OFFSLAB)
#define KMC_GROWING			(1 << KMC_BIT_GROWING)
#define KMC_REAPING			(1 << KMC_BIT_REAPING)
#define KMC_DESTROY			(1 << KMC_BIT_DESTROY)
#define KMC_TOTAL			(1 << KMC_BIT_TOTAL)
#define KMC_ALLOC			(1 << KMC_BIT_ALLOC)
#define KMC_MAX				(1 << KMC_BIT_MAX)

#define KMC_REAP_CHUNK			INT_MAX
#define KMC_DEFAULT_SEEKS		1

/*
 * Global cache list and its companion rw-semaphore, defined elsewhere.
 * NOTE(review): presumably the semaphore guards the list -- confirm.
 */
extern struct list_head spl_kmem_cache_list;
extern struct rw_semaphore spl_kmem_cache_sem;

/* Sanity magic values stored in the skm/sko/sks/skc structures. */
#define SKM_MAGIC			0x2e2e2e2e
#define SKO_MAGIC			0x20202020
#define SKS_MAGIC			0x22222222
#define SKC_MAGIC			0x2c2c2c2c

/* Default cache tunables. */
#define SPL_KMEM_CACHE_DELAY		15	/* Minimum slab release age */
#define SPL_KMEM_CACHE_REAP		0	/* Default reap everything */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB	16	/* Target objects per slab */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN	8	/* Minimum objects per slab */
#define SPL_KMEM_CACHE_ALIGN		8	/* Default object alignment */

#define POINTER_IS_VALID(p)		0	/* Unimplemented */
#define POINTER_INVALIDATE(pp)			/* Unimplemented */
/*
 * Callback types accepted by spl_kmem_cache_create(): object constructor,
 * object destructor and cache reclaim hook.
 * NOTE(review): argument meanings are not visible in this header; they
 * presumably follow the Solaris kmem_cache_create() callbacks -- confirm.
 */
typedef int	(*spl_kmem_ctor_t)(void *, void *, int);
typedef void	(*spl_kmem_dtor_t)(void *, void *);
typedef void	(*spl_kmem_reclaim_t)(void *);
399 typedef struct spl_kmem_magazine
{
400 uint32_t skm_magic
; /* Sanity magic */
401 uint32_t skm_avail
; /* Available objects */
402 uint32_t skm_size
; /* Magazine size */
403 uint32_t skm_refill
; /* Batch refill size */
404 struct spl_kmem_cache
*skm_cache
; /* Owned by cache */
405 struct delayed_work skm_work
; /* Magazine reclaim work */
406 unsigned long skm_age
; /* Last cache access */
407 unsigned int skm_cpu
; /* Owned by cpu */
408 void *skm_objs
[0]; /* Object pointers */
409 } spl_kmem_magazine_t
;
411 typedef struct spl_kmem_obj
{
412 uint32_t sko_magic
; /* Sanity magic */
413 void *sko_addr
; /* Buffer address */
414 struct spl_kmem_slab
*sko_slab
; /* Owned by slab */
415 struct list_head sko_list
; /* Free object list linkage */
418 typedef struct spl_kmem_slab
{
419 uint32_t sks_magic
; /* Sanity magic */
420 uint32_t sks_objs
; /* Objects per slab */
421 struct spl_kmem_cache
*sks_cache
; /* Owned by cache */
422 struct list_head sks_list
; /* Slab list linkage */
423 struct list_head sks_free_list
; /* Free object list */
424 unsigned long sks_age
; /* Last modify jiffie */
425 uint32_t sks_ref
; /* Ref count used objects */
428 typedef struct spl_kmem_alloc
{
429 struct spl_kmem_cache
*ska_cache
; /* Owned by cache */
430 int ska_flags
; /* Allocation flags */
431 struct delayed_work ska_work
; /* Allocation work */
434 typedef struct spl_kmem_emergency
{
435 void *ske_obj
; /* Buffer address */
436 struct list_head ske_list
; /* Emergency list linkage */
437 } spl_kmem_emergency_t
;
439 typedef struct spl_kmem_cache
{
440 uint32_t skc_magic
; /* Sanity magic */
441 uint32_t skc_name_size
; /* Name length */
442 char *skc_name
; /* Name string */
443 spl_kmem_magazine_t
*skc_mag
[NR_CPUS
]; /* Per-CPU warm cache */
444 uint32_t skc_mag_size
; /* Magazine size */
445 uint32_t skc_mag_refill
; /* Magazine refill count */
446 spl_kmem_ctor_t skc_ctor
; /* Constructor */
447 spl_kmem_dtor_t skc_dtor
; /* Destructor */
448 spl_kmem_reclaim_t skc_reclaim
; /* Reclaimator */
449 void *skc_private
; /* Private data */
450 void *skc_vmp
; /* Unused */
451 unsigned long skc_flags
; /* Flags */
452 uint32_t skc_obj_size
; /* Object size */
453 uint32_t skc_obj_align
; /* Object alignment */
454 uint32_t skc_slab_objs
; /* Objects per slab */
455 uint32_t skc_slab_size
; /* Slab size */
456 uint32_t skc_delay
; /* Slab reclaim interval */
457 uint32_t skc_reap
; /* Slab reclaim count */
458 atomic_t skc_ref
; /* Ref count callers */
459 struct delayed_work skc_work
; /* Slab reclaim work */
460 struct list_head skc_list
; /* List of caches linkage */
461 struct list_head skc_complete_list
;/* Completely alloc'ed */
462 struct list_head skc_partial_list
; /* Partially alloc'ed */
463 struct list_head skc_emergency_list
; /* Min sized objects */
464 spinlock_t skc_lock
; /* Cache lock */
465 wait_queue_head_t skc_waitq
; /* Allocation waiters */
466 uint64_t skc_slab_fail
; /* Slab alloc failures */
467 uint64_t skc_slab_create
;/* Slab creates */
468 uint64_t skc_slab_destroy
;/* Slab destroys */
469 uint64_t skc_slab_total
; /* Slab total current */
470 uint64_t skc_slab_alloc
; /* Slab alloc current */
471 uint64_t skc_slab_max
; /* Slab max historic */
472 uint64_t skc_obj_total
; /* Obj total current */
473 uint64_t skc_obj_alloc
; /* Obj alloc current */
474 uint64_t skc_obj_max
; /* Obj max historic */
475 uint64_t skc_obj_emergency
; /* Obj emergency current */
476 uint64_t skc_obj_emergency_max
; /* Obj emergency max */
478 #define kmem_cache_t spl_kmem_cache_t
480 extern spl_kmem_cache_t
*spl_kmem_cache_create(char *name
, size_t size
,
481 size_t align
, spl_kmem_ctor_t ctor
, spl_kmem_dtor_t dtor
,
482 spl_kmem_reclaim_t reclaim
, void *priv
, void *vmp
, int flags
);
483 extern void spl_kmem_cache_set_move(spl_kmem_cache_t
*,
484 kmem_cbrc_t (*)(void *, void *, size_t, void *));
485 extern void spl_kmem_cache_destroy(spl_kmem_cache_t
*skc
);
486 extern void *spl_kmem_cache_alloc(spl_kmem_cache_t
*skc
, int flags
);
487 extern void spl_kmem_cache_free(spl_kmem_cache_t
*skc
, void *obj
);
488 extern void spl_kmem_cache_reap_now(spl_kmem_cache_t
*skc
, int count
);
489 extern void spl_kmem_reap(void);
491 int spl_kmem_init_kallsyms_lookup(void);
492 int spl_kmem_init(void);
493 void spl_kmem_fini(void);
/*
 * Solaris kmem_cache_*() names mapped onto the spl_kmem_cache_*()
 * implementation.
 */
#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \
	spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags)
#define kmem_cache_set_move(skc, move)	spl_kmem_cache_set_move(skc, move)
#define kmem_cache_destroy(skc)		spl_kmem_cache_destroy(skc)
#define kmem_cache_alloc(skc, flags)	spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj)	spl_kmem_cache_free(skc, obj)
/*
 * Reap using the cache's own skc_reap count.  The argument is
 * parenthesized before '->' so that any pointer expression (for example
 * a conditional) may be passed.  Note 'skc' is evaluated twice.
 */
#define kmem_cache_reap_now(skc) \
	spl_kmem_cache_reap_now(skc, (skc)->skc_reap)
#define kmem_reap()			spl_kmem_reap()
/* True when ptr lies in the [VMALLOC_START, VMALLOC_END) range. */
#define kmem_virt(ptr)			(((ptr) >= (void *)VMALLOC_START) && \
					((ptr) < (void *)VMALLOC_END))
507 #endif /* _SPL_KMEM_H */