[mirror_spl.git] / include / sys / kmem.h

/*****************************************************************************\
 *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 *  Copyright (C) 2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *  UCRL-CODE-235197
 *
 *  This file is part of the SPL, Solaris Porting Layer.
 *  For details, see <http://github.com/behlendorf/spl/>.
 *
 *  The SPL is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2 of the License, or (at your
 *  option) any later version.
 *
 *  The SPL is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
\*****************************************************************************/

#ifndef _SPL_KMEM_H
#define	_SPL_KMEM_H

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm_compat.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/hash.h>
#include <linux/ctype.h>
#include <asm/atomic.h>
#include <sys/types.h>
#include <sys/vmsystm.h>
#include <sys/kstat.h>

/*
 * Memory allocation interfaces
 *
 * Solaris KM_* allocation flags expressed in terms of Linux GFP flags:
 *   KM_SLEEP    - GFP_NOFS
 *   KM_NOSLEEP  - GFP_ATOMIC
 *   KM_PANIC    - explicitly left undefined, there is no Linux analog
 *   KM_PUSHPAGE - KM_SLEEP with __GFP_HIGH added
 *   KM_NODEBUG  - __GFP_NOWARN
 * KM_VMFLAGS and KM_FLAGS are aliases for the kernel's GFP_LEVEL_MASK
 * and __GFP_BITS_MASK masks respectively.
 */
#define KM_SLEEP                        GFP_NOFS
#define KM_NOSLEEP                      GFP_ATOMIC
#undef  KM_PANIC                        /* No linux analog */
#define KM_PUSHPAGE                     (KM_SLEEP | __GFP_HIGH)
#define KM_VMFLAGS                      GFP_LEVEL_MASK
#define KM_FLAGS                        __GFP_BITS_MASK
#define KM_NODEBUG                      __GFP_NOWARN

/*
 * Used internally, the kernel does not need to support this flag.
 * The fallback value is only defined when the kernel headers do not
 * already provide __GFP_ZERO.  In DEBUG_KMEM builds the kmem_zalloc()
 * and vmem_zalloc() macros in this header OR the flag into the caller's
 * flags before handing them to the debug/track allocation functions.
 */
#ifndef __GFP_ZERO
# define __GFP_ZERO                     0x8000
#endif

/*
 * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
 * early as 2.6.32.  To avoid this issue when it occurs in upstream kernels
 * we retry the allocation here for as long as the request is allowed to
 * sleep, i.e. __GFP_WAIT is set (which is not the case for GFP_ATOMIC).
 * I would prefer the caller handle the failure case cleanly but we are
 * trying to emulate Solaris and those are not the Solaris semantics.
 */
/*
 * kmalloc() wrapper.  A request that may sleep (__GFP_WAIT set in flags)
 * is reissued until it succeeds; any other request is attempted exactly
 * once and its result, possibly NULL, is handed back to the caller.
 */
static inline void *
kmalloc_nofail(size_t size, gfp_t flags)
{
	void *buf;

	for (;;) {
		buf = kmalloc(size, flags);
		if (buf != NULL || !(flags & __GFP_WAIT))
			return buf;
	}
}

/*
 * kzalloc() wrapper.  A request that may sleep (__GFP_WAIT set in flags)
 * is reissued until it succeeds; any other request is attempted exactly
 * once and its result, possibly NULL, is handed back to the caller.
 */
static inline void *
kzalloc_nofail(size_t size, gfp_t flags)
{
	void *buf;

	for (;;) {
		buf = kzalloc(size, flags);
		if (buf != NULL || !(flags & __GFP_WAIT))
			return buf;
	}
}

/*
 * kmalloc_node() wrapper with the same retry rule as kmalloc_nofail():
 * keep trying while the request may sleep (__GFP_WAIT), otherwise return
 * the first result.  When HAVE_KMALLOC_NODE is not defined the node
 * argument is ignored and the call is forwarded to kmalloc_nofail().
 */
static inline void *
kmalloc_node_nofail(size_t size, gfp_t flags, int node)
{
#ifdef HAVE_KMALLOC_NODE
	void *buf;

	for (;;) {
		buf = kmalloc_node(size, flags, node);
		if (buf != NULL || !(flags & __GFP_WAIT))
			break;
	}

	return buf;
#else
	return kmalloc_nofail(size, flags);
#endif /* HAVE_KMALLOC_NODE */
}

/*
 * __vmalloc() wrapper which retries allocations that are allowed to sleep.
 *
 * size  - number of bytes requested
 * flags - GFP flags for the request; __GFP_HIGHMEM is always added
 *
 * Returns the address of the allocation.  NULL is only returned when
 * flags does not contain __GFP_WAIT; otherwise the call does not return
 * until the allocation has succeeded.
 */
static inline void *
vmalloc_nofail(size_t size, gfp_t flags)
{
	void *ptr;

	/*
	 * Retry failed __vmalloc() allocations once every second.  The
	 * rationale for the delay is that the likely failure modes are:
	 *
	 * 1) The system has completely exhausted memory, in which case
	 *    delaying 1 second for the memory reclaim to run is reasonable
	 *    to avoid thrashing the system.
	 * 2) The system has memory but has exhausted the small virtual
	 *    address space available on 32-bit systems.  Retrying the
	 *    allocation immediately will only result in spinning on the
	 *    virtual address space lock.  It is better to delay a second
	 *    and hope that another process will free some of the address
	 *    space.  But the bottom line is there is not much we can
	 *    actually do since we can never safely return a failure and
	 *    honor the Solaris semantics.
	 */
	while (1) {
		ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
		if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
			/*
			 * The sleep must be uninterruptible.  In the
			 * TASK_INTERRUPTIBLE state schedule_timeout()
			 * returns immediately when the task has a signal
			 * pending, which would turn this loop into exactly
			 * the busy retry the delay is meant to prevent.
			 */
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(HZ);
		} else {
			break;
		}
	}

	return ptr;
}

/*
 * Zero-filling variant of vmalloc_nofail().  The buffer is obtained from
 * vmalloc_nofail() and, when that call produced one, cleared with
 * memset() before being returned.  A NULL result is passed through.
 */
static inline void *
vzalloc_nofail(size_t size, gfp_t flags)
{
	void *buf = vmalloc_nofail(size, flags);

	if (buf == NULL)
		return NULL;

	memset(buf, 0, size);
	return buf;
}

#ifdef DEBUG_KMEM

/*
 * Memory accounting functions to be used only when DEBUG_KMEM is set.
 *
 * The kmem_alloc_used and vmem_alloc_used counters are declared as
 * atomic64_t when HAVE_ATOMIC64_T is defined and as atomic_t otherwise;
 * the *_add/_sub/_read/_set macros below map onto the matching atomic64_*
 * or atomic_* primitive so users do not need to care which one is in use.
 * The kmem_alloc_max and vmem_alloc_max values are unsigned long long in
 * both configurations.
 */
# ifdef HAVE_ATOMIC64_T

# define kmem_alloc_used_add(size)      atomic64_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size)      atomic64_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read()         atomic64_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size)      atomic64_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size)      atomic64_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size)      atomic64_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read()         atomic64_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size)      atomic64_set(&vmem_alloc_used, size)

extern atomic64_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic64_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# else  /* HAVE_ATOMIC64_T */

/* Fallback: the same accessors built on the plain atomic_t primitives. */
# define kmem_alloc_used_add(size)      atomic_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size)      atomic_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read()         atomic_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size)      atomic_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size)      atomic_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size)      atomic_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read()         atomic_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size)      atomic_set(&vmem_alloc_used, size)

extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# endif /* HAVE_ATOMIC64_T */

# ifdef DEBUG_KMEM_TRACKING
/*
 * DEBUG_KMEM && DEBUG_KMEM_TRACKING
 *
 * The maximum level of memory debugging.  All memory will be accounted
 * for and each allocation will be explicitly tracked.  Any allocation
 * which is leaked will be reported on module unload and the exact location
 * where that memory was allocated will be reported.  This level of memory
 * tracking will have a significant impact on performance and should only
 * be enabled for debugging.  This feature may be enabled by passing
 * --enable-debug-kmem-tracking to configure.
 *
 * Every allocation macro passes the caller's __FUNCTION__ and __LINE__.
 * The zeroing variants OR __GFP_ZERO into the flags.  The kmem variants
 * pass two further integers: 0, 0 for kmem_alloc()/kmem_zalloc() and
 * 1, nd for kmem_alloc_node() (presumably a "node allocation" switch and
 * the node id -- confirm against kmem_alloc_track()).
 */
#  define kmem_alloc(sz, fl)            kmem_alloc_track((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_zalloc(sz, fl)           kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_track((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 1, nd)
#  define kmem_free(ptr, sz)            kmem_free_track((ptr), (sz))

#  define vmem_alloc(sz, fl)            vmem_alloc_track((sz), (fl),           \
                                             __FUNCTION__, __LINE__)
#  define vmem_zalloc(sz, fl)           vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__)
#  define vmem_free(ptr, sz)            vmem_free_track((ptr), (sz))

extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
extern void kmem_free_track(void *, size_t);
extern void *vmem_alloc_track(size_t, int, const char *, int);
extern void vmem_free_track(void *, size_t);

# else /* DEBUG_KMEM_TRACKING */
/*
 * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
 *
 * The default build will set DEBUG_KMEM.  This provides basic memory
 * accounting with little to no impact on performance.  When the module
 * is unloaded, if any memory was leaked the total number of leaked bytes
 * will be reported on the console.  To disable this basic accounting
 * pass the --disable-debug-kmem option to configure.
 *
 * The macros take the same arguments as the tracking set above but are
 * routed to the *_debug() functions instead of the *_track() ones.
 */
#  define kmem_alloc(sz, fl)            kmem_alloc_debug((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_zalloc(sz, fl)           kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_debug((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 1, nd)
#  define kmem_free(ptr, sz)            kmem_free_debug((ptr), (sz))

#  define vmem_alloc(sz, fl)            vmem_alloc_debug((sz), (fl),           \
                                             __FUNCTION__, __LINE__)
#  define vmem_zalloc(sz, fl)           vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__)
#  define vmem_free(ptr, sz)            vmem_free_debug((ptr), (sz))

extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
extern void kmem_free_debug(void *, size_t);
extern void *vmem_alloc_debug(size_t, int, const char *, int);
extern void vmem_free_debug(void *, size_t);

# endif /* DEBUG_KMEM_TRACKING */
#else /* DEBUG_KMEM */
/*
 * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
 *
 * All debugging is disabled.  There will be no overhead even for
 * minimal memory accounting.  To enable basic accounting pass the
 * --enable-debug-kmem option to configure.
 *
 * The allocation macros expand directly to the *_nofail() inline helpers
 * defined earlier in this header.  The free macros evaluate the size
 * argument and discard it (keeping the Solaris style two argument call
 * compiling) and pass only the pointer on to kfree()/vfree().
 */
# define kmem_alloc(sz, fl)             kmalloc_nofail((sz), (fl))
# define kmem_zalloc(sz, fl)            kzalloc_nofail((sz), (fl))
# define kmem_alloc_node(sz, fl, nd)    kmalloc_node_nofail((sz), (fl), (nd))
# define kmem_free(ptr, sz)             ((void)(sz), kfree(ptr))

# define vmem_alloc(sz, fl)             vmalloc_nofail((sz), (fl))
# define vmem_zalloc(sz, fl)            vzalloc_nofail((sz), (fl))
# define vmem_free(ptr, sz)             ((void)(sz), vfree(ptr))

#endif /* DEBUG_KMEM */

/*
 * Helper routines implemented outside of this header.
 * NOTE(review): judging by the names and signatures only,
 * kmem_vasprintf()/kmem_asprintf() presumably format into a newly
 * allocated string and strdup()/strfree() presumably form an
 * allocate/release pair -- confirm against the implementation.
 */
extern int kmem_debugging(void);
extern char *kmem_vasprintf(const char *fmt, va_list ap);
extern char *kmem_asprintf(const char *fmt, ...);
extern char *strdup(const char *str);
extern void strfree(char *str);


/*
 * Slab allocation interfaces.  The SPL slab differs from the standard
 * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
 * allocated from the physical or virtual memory address space.  The virtual
 * slabs allow for good behavior when allocating large objects of identical
 * size.  This slab implementation also supports both constructors and
 * destructors which the Linux slab does not.
 */
/* Bit positions of the cache flags; the KMC_* masks are 1 << position. */
enum {
	/* Cache property bits */
	KMC_BIT_NOTOUCH		= 0,	/* Ages are not updated */
	KMC_BIT_NODEBUG		= 1,	/* This is the default behavior */
	KMC_BIT_NOMAGAZINE	= 2,	/* Not supported (XXX) */
	KMC_BIT_NOHASH		= 3,	/* Not supported (XXX) */
	KMC_BIT_QCACHE		= 4,	/* Not supported (XXX) */
	KMC_BIT_KMEM		= 5,	/* Cache uses kmem */
	KMC_BIT_VMEM		= 6,	/* Cache uses vmem */
	KMC_BIT_OFFSLAB		= 7,	/* Objects are kept off the slab */

	/* In-progress state bits */
	KMC_BIT_REAPING		= 16,	/* A reap is in progress */
	KMC_BIT_DESTROY		= 17,	/* A destroy is in progress */
};

/*
 * Flag masks, one per KMC_BIT_* position.  Each mask is an int with
 * only the corresponding bit set.
 */
#define KMC_NOTOUCH		(1 << KMC_BIT_NOTOUCH)
#define KMC_NODEBUG		(1 << KMC_BIT_NODEBUG)
#define KMC_NOMAGAZINE		(1 << KMC_BIT_NOMAGAZINE)
#define KMC_NOHASH		(1 << KMC_BIT_NOHASH)
#define KMC_QCACHE		(1 << KMC_BIT_QCACHE)
#define KMC_KMEM		(1 << KMC_BIT_KMEM)
#define KMC_VMEM		(1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB		(1 << KMC_BIT_OFFSLAB)
#define KMC_REAPING		(1 << KMC_BIT_REAPING)
#define KMC_DESTROY		(1 << KMC_BIT_DESTROY)

/*
 * NOTE(review): the consumers of these two constants are not visible in
 * this header; their exact meaning should be confirmed at the use sites.
 */
#define KMC_REAP_CHUNK			INT_MAX
#define KMC_DEFAULT_SEEKS		1

/*
 * Global cache list and the rw-semaphore declared alongside it
 * (presumably the lock protecting the list -- confirm in the
 * implementation).
 */
extern struct list_head spl_kmem_cache_list;
extern struct rw_semaphore spl_kmem_cache_sem;

/*
 * Sanity magic values.  Each is a single byte repeated four times.  By
 * prefix they pair with the skm_magic, sko_magic, sks_magic and skc_magic
 * members of the magazine, object, slab and cache structures below.
 */
#define SKM_MAGIC			0x2e2e2e2e
#define SKO_MAGIC			0x20202020
#define SKS_MAGIC			0x22222222
#define SKC_MAGIC			0x2c2c2c2c

/* Cache tunables */
#define SPL_KMEM_CACHE_DELAY		15	/* Minimum slab release age */
#define SPL_KMEM_CACHE_REAP		0	/* Default reap everything */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB	32	/* Target objects per slab */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN	8	/* Minimum objects per slab */
#define SPL_KMEM_CACHE_ALIGN		8	/* Default object alignment */

/*
 * Callback types accepted by spl_kmem_cache_create(): constructor,
 * destructor and reclaim.  NOTE(review): the meaning of the individual
 * arguments is not visible here; presumably (object, private data, flags)
 * for the constructor, (object, private data) for the destructor and
 * (private data) for reclaim -- confirm against the callers.
 */
typedef int (*spl_kmem_ctor_t)(void *, void *, int);
typedef void (*spl_kmem_dtor_t)(void *, void *);
typedef void (*spl_kmem_reclaim_t)(void *);

/*
 * Magazine of object pointers.  The owning cache keeps one pointer to a
 * magazine per CPU in its skc_mag[] array.  The object pointer storage
 * trails the fixed header, so a magazine must be allocated with room for
 * the desired number of skm_objs[] entries after the structure itself.
 *
 * skm_objs is a C99 flexible array member rather than a GNU zero-length
 * array: the layout and sizeof() of the structure are unchanged, but the
 * compiler now enforces that it stays the last member.
 */
typedef struct spl_kmem_magazine {
	uint32_t		skm_magic;	/* Sanity magic */
	uint32_t		skm_avail;	/* Available objects */
	uint32_t		skm_size;	/* Magazine size */
	uint32_t		skm_refill;	/* Batch refill size */
	struct spl_kmem_cache	*skm_cache;	/* Owned by cache */
	struct delayed_work	skm_work;	/* Magazine reclaim work */
	unsigned long		skm_age;	/* Last cache access */
	void			*skm_objs[];	/* Object pointers */
} spl_kmem_magazine_t;

/* Per-object bookkeeping record, linked on its slab's free object list. */
typedef struct spl_kmem_obj {
	uint32_t		sko_magic;	/* Magic for sanity checks */
	void			*sko_addr;	/* Address of the buffer */
	struct spl_kmem_slab	*sko_slab;	/* Slab owning this object */
	struct list_head	sko_list;	/* Linkage on free object list */
} spl_kmem_obj_t;

/* Slab header: a group of objects belonging to a single cache. */
typedef struct spl_kmem_slab {
	uint32_t		sks_magic;	/* Magic for sanity checks */
	uint32_t		sks_objs;	/* Number of objects per slab */
	struct spl_kmem_cache	*sks_cache;	/* Cache owning this slab */
	struct list_head	sks_list;	/* Linkage on slab list */
	struct list_head	sks_free_list;	/* List of free objects */
	unsigned long		sks_age;	/* Jiffie of last modification */
	uint32_t		sks_ref;	/* Reference count, used objects */
} spl_kmem_slab_t;

/*
 * Cache descriptor.  Holds the identity of the cache (name, magic), the
 * per-CPU magazine pointers, the callbacks and private data supplied at
 * creation, the object/slab geometry, the slab lists with their lock,
 * and running statistics.  kmem_cache_t is provided as an alias so code
 * written against the Solaris type name compiles unchanged.
 */
typedef struct spl_kmem_cache {
	uint32_t		skc_magic;	/* Sanity magic */
	uint32_t		skc_name_size;	/* Name length */
	char			*skc_name;	/* Name string */
	spl_kmem_magazine_t	*skc_mag[NR_CPUS]; /* Per-CPU warm cache */
	uint32_t		skc_mag_size;	/* Magazine size */
	uint32_t		skc_mag_refill;	/* Magazine refill count */
	spl_kmem_ctor_t		skc_ctor;	/* Constructor */
	spl_kmem_dtor_t		skc_dtor;	/* Destructor */
	spl_kmem_reclaim_t	skc_reclaim;	/* Reclaim callback */
	void			*skc_private;	/* Private data */
	void			*skc_vmp;	/* Unused */
	unsigned long		skc_flags;	/* Flags */
	uint32_t		skc_obj_size;	/* Object size */
	uint32_t		skc_obj_align;	/* Object alignment */
	uint32_t		skc_slab_objs;	/* Objects per slab */
	uint32_t		skc_slab_size;	/* Slab size */
	uint32_t		skc_delay;	/* Slab reclaim interval */
	uint32_t		skc_reap;	/* Slab reclaim count */
	atomic_t		skc_ref;	/* Ref count callers */
	struct delayed_work	skc_work;	/* Slab reclaim work */
	struct list_head	skc_list;	/* List of caches linkage */
	struct list_head	skc_complete_list;/* Completely alloc'ed */
	struct list_head	skc_partial_list; /* Partially alloc'ed */
	spinlock_t		skc_lock;	/* Cache lock */
	uint64_t		skc_slab_fail;	/* Slab alloc failures */
	uint64_t		skc_slab_create;/* Slab creates */
	uint64_t		skc_slab_destroy;/* Slab destroys */
	uint64_t		skc_slab_total;	/* Slab total current */
	uint64_t		skc_slab_alloc;	/* Slab alloc current */
	uint64_t		skc_slab_max;	/* Slab max historic  */
	uint64_t		skc_obj_total;	/* Obj total current */
	uint64_t		skc_obj_alloc;	/* Obj alloc current */
	uint64_t		skc_obj_max;	/* Obj max historic */
} spl_kmem_cache_t;
#define kmem_cache_t		spl_kmem_cache_t

/*
 * Cache API.  spl_kmem_cache_create() takes the cache name, the object
 * size and alignment, the constructor/destructor/reclaim callbacks, two
 * opaque pointers (priv, vmp) and a flags word, and returns the new cache
 * descriptor.  NOTE(review): flags is presumably a combination of the
 * KMC_* masks and priv presumably ends up in skc_private -- confirm in
 * the implementation.
 */
extern spl_kmem_cache_t *
spl_kmem_cache_create(char *name, size_t size, size_t align,
        spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
        void *priv, void *vmp, int flags);

/* Per-cache operations plus spl_kmem_reap(), which takes no cache. */
extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc);
extern void spl_kmem_reap(void);

/* Subsystem setup and teardown entry points; the int results are status. */
int spl_kmem_init_kallsyms_lookup(void);
int spl_kmem_init(void);
void spl_kmem_fini(void);

/*
 * Solaris-named wrappers.  Each macro forwards its arguments unchanged
 * to the spl_kmem_* function of the same name.
 */
#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \
        spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags)
#define kmem_cache_destroy(skc)		spl_kmem_cache_destroy(skc)
#define kmem_cache_alloc(skc, flags)	spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj)	spl_kmem_cache_free(skc, obj)
#define kmem_cache_reap_now(skc)	spl_kmem_cache_reap_now(skc)
#define kmem_reap()			spl_kmem_reap()
/*
 * kmem_virt(ptr) is non-zero when ptr lies in the half-open range
 * [VMALLOC_START, VMALLOC_END) and zero otherwise.
 *
 * The argument is captured in a local first (GNU statement expression)
 * so that it is evaluated exactly once; this keeps the macro safe for
 * arguments with side effects.
 */
#define kmem_virt(ptr)							\
({									\
	const void *kv_ptr_ = (ptr);					\
	(kv_ptr_ >= (void *)VMALLOC_START) &&				\
	    (kv_ptr_ < (void *)VMALLOC_END);				\
})

#endif	/* _SPL_KMEM_H */
Commit	Line	Data
716154c5 BB	1	/*****************************************************************************\
	2	* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
	3	* Copyright (C) 2007 The Regents of the University of California.
	4	* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
	5	* Written by Brian Behlendorf <behlendorf1@llnl.gov>.
715f6251	6	* UCRL-CODE-235197
715f6251	7	*
716154c5 BB	8	* This file is part of the SPL, Solaris Porting Layer.
	9	* For details, see <http://github.com/behlendorf/spl/>.
	10	*
	11	* The SPL is free software; you can redistribute it and/or modify it
	12	* under the terms of the GNU General Public License as published by the
	13	* Free Software Foundation; either version 2 of the License, or (at your
	14	* option) any later version.
715f6251	15	*
716154c5	16	* The SPL is distributed in the hope that it will be useful, but WITHOUT
715f6251	17	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	18	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	19	* for more details.
	20	*
	21	* You should have received a copy of the GNU General Public License along
716154c5 BB	22	* with the SPL. If not, see <http://www.gnu.org/licenses/>.
716154c5 BB	23	\*****************************************************************************/
715f6251	24
09b414e8	25	#ifndef _SPL_KMEM_H
09b414e8	26	#define _SPL_KMEM_H
f1ca4da6	27
f1b59d26	28	#include <linux/module.h>
f1ca4da6	29	#include <linux/slab.h>
79b31f36	30	#include <linux/vmalloc.h>
baf2979e	31	#include <linux/mm_compat.h>
f1ca4da6	32	#include <linux/spinlock.h>
d6a26c6a	33	#include <linux/rwsem.h>
	34	#include <linux/hash.h>
	35	#include <linux/ctype.h>
d04c8a56	36	#include <asm/atomic.h>
57d86234	37	#include <sys/types.h>
36b313da	38	#include <sys/vmsystm.h>
def465ad	39	#include <sys/kstat.h>
550f1705	40
f1ca4da6	41	/*
	42	* Memory allocation interfaces
	43	*/
82b8c8fa	44	#define KM_SLEEP GFP_NOFS
f1ca4da6	45	#define KM_NOSLEEP GFP_ATOMIC
f1ca4da6	46	#undef KM_PANIC /* No linux analog */
a0f6da3d	47	#define KM_PUSHPAGE (KM_SLEEP | __GFP_HIGH)
f1ca4da6	48	#define KM_VMFLAGS GFP_LEVEL_MASK
f1ca4da6	49	#define KM_FLAGS __GFP_BITS_MASK
23d91792	50	#define KM_NODEBUG __GFP_NOWARN
f1ca4da6	51
3d061e9d	52	/*
	53	* Used internally, the kernel does not need to support this flag
	54	*/
	55	#ifndef __GFP_ZERO
a0f6da3d	56	# define __GFP_ZERO 0x8000
3d061e9d	57	#endif
3d061e9d	58
c89fdee4 BB	59	/*
	60	* __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
	61	* early as 2.6.32. To avoid this issue when it occurs in upstream kernels
	62	* we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
	63	* I would prefer the caller handle the failure case cleanly but we are
	64	* trying to emulate Solaris and those are not the Solaris semantics.
	65	*/
	66	static inline void *
	67	kmalloc_nofail(size_t size, gfp_t flags)
	68	{
	69	void *ptr;
	70
	71	do {
	72	ptr = kmalloc(size, flags);
	73	} while (ptr == NULL && (flags & __GFP_WAIT));
	74
	75	return ptr;
	76	}
	77
	78	static inline void *
	79	kzalloc_nofail(size_t size, gfp_t flags)
	80	{
	81	void *ptr;
	82
	83	do {
	84	ptr = kzalloc(size, flags);
	85	} while (ptr == NULL && (flags & __GFP_WAIT));
	86
	87	return ptr;
	88	}
	89
c89fdee4 BB	90	static inline void *
	91	kmalloc_node_nofail(size_t size, gfp_t flags, int node)
	92	{
10129680	93	#ifdef HAVE_KMALLOC_NODE
c89fdee4 BB	94	void *ptr;
	95
	96	do {
	97	ptr = kmalloc_node(size, flags, node);
	98	} while (ptr == NULL && (flags & __GFP_WAIT));
	99
	100	return ptr;
10129680 BB	101	#else
10129680 BB	102	return kmalloc_nofail(size, flags);
c89fdee4	103	#endif /* HAVE_KMALLOC_NODE */
10129680 BB	104	}
	105
	106	static inline void *
	107	vmalloc_nofail(size_t size, gfp_t flags)
	108	{
	109	void *ptr;
	110
	111	/*
	112	* Retry failed __vmalloc() allocations once every second. The
	113	* rational for the delay is that the likely failure modes are:
	114	*
	115	* 1) The system has completely exhausted memory, in which case
	116	* delaying 1 second for the memory reclaim to run is reasonable
	117	* to avoid thrashing the system.
	118	* 2) The system has memory but has exhausted the small virtual
	119	* address space available on 32-bit systems. Retrying the
	120	* allocation immediately will only result in spinning on the
	121	* virtual address space lock. It is better delay a second and
	122	* hope that another process will free some of the address space.
	123	* But the bottom line is there is not much we can actually do
	124	* since we can never safely return a failure and honor the
	125	* Solaris semantics.
	126	*/
	127	while (1) {
	128	ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
	129	if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
	130	set_current_state(TASK_INTERRUPTIBLE);
	131	schedule_timeout(HZ);
	132	} else {
	133	break;
	134	}
	135	}
	136
	137	return ptr;
	138	}
	139
	140	static inline void *
	141	vzalloc_nofail(size_t size, gfp_t flags)
	142	{
	143	void *ptr;
	144
	145	ptr = vmalloc_nofail(size, flags);
	146	if (ptr)
	147	memset(ptr, 0, (size));
	148
	149	return ptr;
	150	}
c89fdee4	151
f1ca4da6	152	#ifdef DEBUG_KMEM
a0f6da3d	153
10129680 BB	154	/*
	155	* Memory accounting functions to be used only when DEBUG_KMEM is set.
	156	*/
	157	# ifdef HAVE_ATOMIC64_T
a0f6da3d	158
d04c8a56 BB	159	# define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
	160	# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
	161	# define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used)
	162	# define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size)
	163	# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used)
	164	# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used)
	165	# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
	166	# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)
	167
10129680	168	extern atomic64_t kmem_alloc_used;
d04c8a56	169	extern unsigned long long kmem_alloc_max;
10129680	170	extern atomic64_t vmem_alloc_used;
d04c8a56 BB	171	extern unsigned long long vmem_alloc_max;
d04c8a56 BB	172
10129680 BB	173	# else /* HAVE_ATOMIC64_T */
10129680 BB	174
d04c8a56 BB	175	# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
	176	# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
	177	# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
	178	# define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size)
	179	# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used)
	180	# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used)
	181	# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
	182	# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)
	183
10129680 BB	184	extern atomic_t kmem_alloc_used;
	185	extern unsigned long long kmem_alloc_max;
	186	extern atomic_t vmem_alloc_used;
	187	extern unsigned long long vmem_alloc_max;
a0f6da3d	188
10129680	189	# endif /* HAVE_ATOMIC64_T */
a0f6da3d	190
a0f6da3d	191	# ifdef DEBUG_KMEM_TRACKING
10129680 BB	192	/*
	193	* DEBUG_KMEM && DEBUG_KMEM_TRACKING
	194	*
	195	* The maximum level of memory debugging. All memory will be accounted
	196	* for and each allocation will be explicitly tracked. Any allocation
	197	* which is leaked will be reported on module unload and the exact location
	198	* where that memory was allocation will be reported. This level of memory
	199	* tracking will have a significant impact on performance and should only
	200	* be enabled for debugging. This feature may be enabled by passing
	201	* --enable-debug-kmem-tracking to configure.
	202	*/
	203	# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
	204	__FUNCTION__, __LINE__, 0, 0)
	205	# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
	206	__FUNCTION__, __LINE__, 0, 0)
	207	# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
	208	__FUNCTION__, __LINE__, 1, nd)
	209	# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))
	210
	211	# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
	212	__FUNCTION__, __LINE__)
	213	# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
	214	__FUNCTION__, __LINE__)
	215	# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))
	216
	217	extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
	218	extern void kmem_free_track(void *, size_t);
	219	extern void *vmem_alloc_track(size_t, int, const char *, int);
	220	extern void vmem_free_track(void *, size_t);
a0f6da3d	221
a0f6da3d	222	# else /* DEBUG_KMEM_TRACKING */
10129680 BB	223	/*
	224	* DEBUG_KMEM && !DEBUG_KMEM_TRACKING
	225	*
	226	* The default build will set DEBUG_KEM. This provides basic memory
	227	* accounting with little to no impact on performance. When the module
	228	* is unloaded in any memory was leaked the total number of leaked bytes
	229	* will be reported on the console. To disable this basic accounting
	230	* pass the --disable-debug-kmem option to configure.
	231	*/
	232	# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
	233	__FUNCTION__, __LINE__, 0, 0)
	234	# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
	235	__FUNCTION__, __LINE__, 0, 0)
	236	# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
	237	__FUNCTION__, __LINE__, 1, nd)
	238	# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))
	239
	240	# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
	241	__FUNCTION__, __LINE__)
	242	# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
	243	__FUNCTION__, __LINE__)
	244	# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))
	245
	246	extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
	247	extern void kmem_free_debug(void *, size_t);
	248	extern void *vmem_alloc_debug(size_t, int, const char *, int);
	249	extern void vmem_free_debug(void *, size_t);
a0f6da3d	250
a0f6da3d	251	# endif /* DEBUG_KMEM_TRACKING */
c6dc93d6	252	#else /* DEBUG_KMEM */
10129680 BB	253	/*
	254	* !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
	255	*
	256	* All debugging is disabled. There will be no overhead even for
	257	* minimal memory accounting. To enable basic accounting pass the
	258	* --enable-debug-kmem option to configure.
	259	*/
	260	# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
	261	# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
	262	# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
	263	# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))
f1ca4da6	264
10129680 BB	265	# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
	266	# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
	267	# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
79b31f36	268
f1ca4da6	269	#endif /* DEBUG_KMEM */
f1ca4da6	270
10129680 BB	271	extern int kmem_debugging(void);
	272	extern char *kmem_vasprintf(const char *fmt, va_list ap);
	273	extern char *kmem_asprintf(const char *fmt, ...);
	274	extern char *strdup(const char *str);
	275	extern void strfree(char *str);
	276
	277
f1ca4da6	278	/*
10129680 BB	279	* Slab allocation interfaces. The SPL slab differs from the standard
	280	* Linux SLAB or SLUB primarily in that each cache may be backed by slabs
	281	* allocated from the physical or virtal memory address space. The virtual
	282	* slabs allow for good behavior when allocation large objects of identical
	283	* size. This slab implementation also supports both constructors and
	284	* destructions which the Linux slab does not.
f1ca4da6	285	*/
ea3e6ca9 BB	286	enum {
	287	KMC_BIT_NOTOUCH = 0, /* Don't update ages */
	288	KMC_BIT_NODEBUG = 1, /* Default behavior */
	289	KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */
	290	KMC_BIT_NOHASH = 3, /* XXX: Unsupported */
	291	KMC_BIT_QCACHE = 4, /* XXX: Unsupported */
	292	KMC_BIT_KMEM = 5, /* Use kmem cache */
	293	KMC_BIT_VMEM = 6, /* Use vmem cache */
	294	KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
	295	KMC_BIT_REAPING = 16, /* Reaping in progress */
	296	KMC_BIT_DESTROY = 17, /* Destroy in progress */
	297	};
	298
	299	#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH)
	300	#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG)
	301	#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE)
	302	#define KMC_NOHASH (1 << KMC_BIT_NOHASH)
	303	#define KMC_QCACHE (1 << KMC_BIT_QCACHE)
	304	#define KMC_KMEM (1 << KMC_BIT_KMEM)
	305	#define KMC_VMEM (1 << KMC_BIT_VMEM)
	306	#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
	307	#define KMC_REAPING (1 << KMC_BIT_REAPING)
	308	#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
	309
	310	#define KMC_REAP_CHUNK INT_MAX
	311	#define KMC_DEFAULT_SEEKS 1
f1ca4da6	312
ff449ac4	313	extern struct list_head spl_kmem_cache_list;
ff449ac4	314	extern struct rw_semaphore spl_kmem_cache_sem;
2fb9b26a	315
4afaaefa	316	#define SKM_MAGIC 0x2e2e2e2e
2fb9b26a	317	#define SKO_MAGIC 0x20202020
	318	#define SKS_MAGIC 0x22222222
	319	#define SKC_MAGIC 0x2c2c2c2c
	320
37db7d8c BB	321	#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */
37db7d8c BB	322	#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */
ea3e6ca9 BB	323	#define SPL_KMEM_CACHE_OBJ_PER_SLAB 32 /* Target objects per slab */
	324	#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 8 /* Minimum objects per slab */
	325	#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */
2fb9b26a	326
	327	typedef int (*spl_kmem_ctor_t)(void *, void *, int);
	328	typedef void (*spl_kmem_dtor_t)(void *, void *);
	329	typedef void (*spl_kmem_reclaim_t)(void *);
	330
4afaaefa	331	typedef struct spl_kmem_magazine {
9b1b8e4c	332	uint32_t skm_magic; /* Sanity magic */
4afaaefa	333	uint32_t skm_avail; /* Available objects */
	334	uint32_t skm_size; /* Magazine size */
	335	uint32_t skm_refill; /* Batch refill size */
9b1b8e4c BB	336	struct spl_kmem_cache *skm_cache; /* Owned by cache */
9b1b8e4c BB	337	struct delayed_work skm_work; /* Magazine reclaim work */
4afaaefa	338	unsigned long skm_age; /* Last cache access */
	339	void *skm_objs[0]; /* Object pointers */
	340	} spl_kmem_magazine_t;
	341
2fb9b26a	342	typedef struct spl_kmem_obj {
2fb9b26a	343	uint32_t sko_magic; /* Sanity magic */
2fb9b26a	344	void *sko_addr; /* Buffer address */
	345	struct spl_kmem_slab *sko_slab; /* Owned by slab */
	346	struct list_head sko_list; /* Free object list linkage */
2fb9b26a	347	} spl_kmem_obj_t;
	348
	349	typedef struct spl_kmem_slab {
	350	uint32_t sks_magic; /* Sanity magic */
	351	uint32_t sks_objs; /* Objects per slab */
	352	struct spl_kmem_cache *sks_cache; /* Owned by cache */
	353	struct list_head sks_list; /* Slab list linkage */
	354	struct list_head sks_free_list; /* Free object list */
	355	unsigned long sks_age; /* Last modify jiffie */
4afaaefa	356	uint32_t sks_ref; /* Ref count used objects */
2fb9b26a	357	} spl_kmem_slab_t;
	358
	359	typedef struct spl_kmem_cache {
ea3e6ca9 BB	360	uint32_t skc_magic; /* Sanity magic */
	361	uint32_t skc_name_size; /* Name length */
	362	char *skc_name; /* Name string */
4afaaefa	363	spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */
	364	uint32_t skc_mag_size; /* Magazine size */
	365	uint32_t skc_mag_refill; /* Magazine refill count */
ea3e6ca9 BB	366	spl_kmem_ctor_t skc_ctor; /* Constructor */
	367	spl_kmem_dtor_t skc_dtor; /* Destructor */
	368	spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
	369	void *skc_private; /* Private data */
	370	void *skc_vmp; /* Unused */
31a033ec	371	unsigned long skc_flags; /* Flags */
2fb9b26a	372	uint32_t skc_obj_size; /* Object size */
48e0606a	373	uint32_t skc_obj_align; /* Object alignment */
a1502d76	374	uint32_t skc_slab_objs; /* Objects per slab */
ea3e6ca9 BB	375	uint32_t skc_slab_size; /* Slab size */
ea3e6ca9 BB	376	uint32_t skc_delay; /* Slab reclaim interval */
37db7d8c	377	uint32_t skc_reap; /* Slab reclaim count */
ea3e6ca9 BB	378	atomic_t skc_ref; /* Ref count callers */
ea3e6ca9 BB	379	struct delayed_work skc_work; /* Slab reclaim work */
ea3e6ca9	380	struct list_head skc_list; /* List of caches linkage */
2fb9b26a	381	struct list_head skc_complete_list;/* Completely alloc'ed */
2fb9b26a	382	struct list_head skc_partial_list; /* Partially alloc'ed */
d46630e0	383	spinlock_t skc_lock; /* Cache lock */
2fb9b26a	384	uint64_t skc_slab_fail; /* Slab alloc failures */
	385	uint64_t skc_slab_create;/* Slab creates */
	386	uint64_t skc_slab_destroy;/* Slab destroys */
d46630e0	387	uint64_t skc_slab_total; /* Slab total current */
ea3e6ca9	388	uint64_t skc_slab_alloc; /* Slab alloc current */
d46630e0	389	uint64_t skc_slab_max; /* Slab max historic */
	390	uint64_t skc_obj_total; /* Obj total current */
	391	uint64_t skc_obj_alloc; /* Obj alloc current */
	392	uint64_t skc_obj_max; /* Obj max historic */
2fb9b26a	393	} spl_kmem_cache_t;
7afde631	394	#define kmem_cache_t spl_kmem_cache_t
2fb9b26a	395
	396	extern spl_kmem_cache_t *
	397	spl_kmem_cache_create(char *name, size_t size, size_t align,
	398	spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
f1ca4da6	399	void *priv, void *vmp, int flags);
f1ca4da6	400
2fb9b26a	401	extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
	402	extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
	403	extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
	404	extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc);
	405	extern void spl_kmem_reap(void);
f1ca4da6	406
d1ff2312	407	int spl_kmem_init_kallsyms_lookup(void);
2fb9b26a	408	int spl_kmem_init(void);
2fb9b26a	409	void spl_kmem_fini(void);
5d86345d	410
f1ca4da6	411	#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \
2fb9b26a	412	spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags)
	413	#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
	414	#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
	415	#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
	416	#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc)
	417	#define kmem_reap() spl_kmem_reap()
a1502d76	418	#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \
a1502d76	419	((ptr) < (void *)VMALLOC_END))
f1ca4da6	420
09b414e8	421	#endif /* _SPL_KMEM_H */