]> git.proxmox.com Git - mirror_spl.git/blob - include/sys/kmem.h
Revert "Add PF_NOFS debugging flag"
[mirror_spl.git] / include / sys / kmem.h
1 /*****************************************************************************\
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
6 * UCRL-CODE-235197
7 *
8 * This file is part of the SPL, Solaris Porting Layer.
9 * For details, see <http://zfsonlinux.org/>.
10 *
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 *
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 * for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 \*****************************************************************************/
24
25 #ifndef _SPL_KMEM_H
26 #define _SPL_KMEM_H
27
28 #include <linux/module.h>
29 #include <linux/slab.h>
30 #include <linux/vmalloc.h>
31 #include <linux/spinlock.h>
32 #include <linux/rwsem.h>
33 #include <linux/hash.h>
34 #include <linux/rbtree.h>
35 #include <linux/ctype.h>
36 #include <asm/atomic.h>
37 #include <sys/types.h>
38 #include <sys/vmsystm.h>
39 #include <sys/kstat.h>
40 #include <sys/taskq.h>
41
42 /*
43 * Memory allocation interfaces
44 */
45 #define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */
46 #define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */
47 #define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */
48 #define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */
49 #define KM_FLAGS __GFP_BITS_MASK
50 #define KM_VMFLAGS GFP_LEVEL_MASK
51
52 /*
53 * Used internally, the kernel does not need to support this flag
54 */
55 #ifndef __GFP_ZERO
56 # define __GFP_ZERO 0x8000
57 #endif
58
59 /*
60 * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
61 * early as 2.6.32. To avoid this issue when it occurs in upstream kernels
62 * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
63 * I would prefer the caller handle the failure case cleanly but we are
64 * trying to emulate Solaris and those are not the Solaris semantics.
65 */
66 static inline void *
67 kmalloc_nofail(size_t size, gfp_t flags)
68 {
69 void *ptr;
70
71 do {
72 ptr = kmalloc(size, flags);
73 } while (ptr == NULL && (flags & __GFP_WAIT));
74
75 return ptr;
76 }
77
78 static inline void *
79 kzalloc_nofail(size_t size, gfp_t flags)
80 {
81 void *ptr;
82
83 do {
84 ptr = kzalloc(size, flags);
85 } while (ptr == NULL && (flags & __GFP_WAIT));
86
87 return ptr;
88 }
89
90 static inline void *
91 kmalloc_node_nofail(size_t size, gfp_t flags, int node)
92 {
93 void *ptr;
94
95 do {
96 ptr = kmalloc_node(size, flags, node);
97 } while (ptr == NULL && (flags & __GFP_WAIT));
98
99 return ptr;
100 }
101
102 static inline void *
103 vmalloc_nofail(size_t size, gfp_t flags)
104 {
105 void *ptr;
106
107 /*
108 * Retry failed __vmalloc() allocations once every second. The
109 * rational for the delay is that the likely failure modes are:
110 *
111 * 1) The system has completely exhausted memory, in which case
112 * delaying 1 second for the memory reclaim to run is reasonable
113 * to avoid thrashing the system.
114 * 2) The system has memory but has exhausted the small virtual
115 * address space available on 32-bit systems. Retrying the
116 * allocation immediately will only result in spinning on the
117 * virtual address space lock. It is better delay a second and
118 * hope that another process will free some of the address space.
119 * But the bottom line is there is not much we can actually do
120 * since we can never safely return a failure and honor the
121 * Solaris semantics.
122 */
123 while (1) {
124 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
125 if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
126 set_current_state(TASK_INTERRUPTIBLE);
127 schedule_timeout(HZ);
128 } else {
129 break;
130 }
131 }
132
133 return ptr;
134 }
135
136 static inline void *
137 vzalloc_nofail(size_t size, gfp_t flags)
138 {
139 void *ptr;
140
141 ptr = vmalloc_nofail(size, flags);
142 if (ptr)
143 memset(ptr, 0, (size));
144
145 return ptr;
146 }
147
148 #ifdef DEBUG_KMEM
149
150 /*
151 * Memory accounting functions to be used only when DEBUG_KMEM is set.
152 */
153 # ifdef HAVE_ATOMIC64_T
154
155 # define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
156 # define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
157 # define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used)
158 # define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size)
159 # define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used)
160 # define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used)
161 # define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
162 # define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)
163
164 extern atomic64_t kmem_alloc_used;
165 extern unsigned long long kmem_alloc_max;
166 extern atomic64_t vmem_alloc_used;
167 extern unsigned long long vmem_alloc_max;
168
169 # else /* HAVE_ATOMIC64_T */
170
171 # define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
172 # define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
173 # define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
174 # define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size)
175 # define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used)
176 # define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used)
177 # define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
178 # define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)
179
180 extern atomic_t kmem_alloc_used;
181 extern unsigned long long kmem_alloc_max;
182 extern atomic_t vmem_alloc_used;
183 extern unsigned long long vmem_alloc_max;
184
185 # endif /* HAVE_ATOMIC64_T */
186
187 # ifdef DEBUG_KMEM_TRACKING
188 /*
189 * DEBUG_KMEM && DEBUG_KMEM_TRACKING
190 *
191 * The maximum level of memory debugging. All memory will be accounted
192 * for and each allocation will be explicitly tracked. Any allocation
193 * which is leaked will be reported on module unload and the exact location
194 * where that memory was allocated will be reported. This level of memory
195 * tracking will have a significant impact on performance and should only
196 * be enabled for debugging. This feature may be enabled by passing
197 * --enable-debug-kmem-tracking to configure.
198 */
199 # define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
200 __FUNCTION__, __LINE__, 0, 0)
201 # define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
202 __FUNCTION__, __LINE__, 0, 0)
203 # define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
204 __FUNCTION__, __LINE__, 1, nd)
205 # define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))
206
207 # define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
208 __FUNCTION__, __LINE__)
209 # define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
210 __FUNCTION__, __LINE__)
211 # define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))
212
213 extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
214 extern void kmem_free_track(const void *, size_t);
215 extern void *vmem_alloc_track(size_t, int, const char *, int);
216 extern void vmem_free_track(const void *, size_t);
217
218 # else /* DEBUG_KMEM_TRACKING */
219 /*
220 * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
221 *
222 * The default build will set DEBUG_KMEM. This provides basic memory
223 * accounting with little to no impact on performance. When the module
224 * is unloaded, if any memory was leaked the total number of leaked bytes
225 * will be reported on the console. To disable this basic accounting
226 * pass the --disable-debug-kmem option to configure.
227 */
228 # define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
229 __FUNCTION__, __LINE__, 0, 0)
230 # define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
231 __FUNCTION__, __LINE__, 0, 0)
232 # define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
233 __FUNCTION__, __LINE__, 1, nd)
234 # define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))
235
236 # define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
237 __FUNCTION__, __LINE__)
238 # define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
239 __FUNCTION__, __LINE__)
240 # define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))
241
242 extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
243 extern void kmem_free_debug(const void *, size_t);
244 extern void *vmem_alloc_debug(size_t, int, const char *, int);
245 extern void vmem_free_debug(const void *, size_t);
246
247 # endif /* DEBUG_KMEM_TRACKING */
248 #else /* DEBUG_KMEM */
249 /*
250 * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
251 *
252 * All debugging is disabled. There will be no overhead even for
253 * minimal memory accounting. To enable basic accounting pass the
254 * --enable-debug-kmem option to configure.
255 */
256 # define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
257 # define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
258 # define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
259 # define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))
260
261 # define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
262 # define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
263 # define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
264
265 #endif /* DEBUG_KMEM */
266
267 extern int kmem_debugging(void);
268 extern char *kmem_vasprintf(const char *fmt, va_list ap);
269 extern char *kmem_asprintf(const char *fmt, ...);
270 extern char *strdup(const char *str);
271 extern void strfree(char *str);
272
273
274 /*
275 * Slab allocation interfaces. The SPL slab differs from the standard
276 * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
277 * allocated from the physical or virtual memory address space. The virtual
278 * slabs allow for good behavior when allocating large objects of identical
279 * size. This slab implementation also supports both constructors and
280 * destructors which the Linux slab does not.
281 */
/*
 * Bit positions used to build the KMC_* cache flag masks.  Positions
 * 10 through 13 are not assigned by this enum.
 */
enum {
	KMC_BIT_NOTOUCH		= 0,	/* Do not update ages */
	KMC_BIT_NODEBUG		= 1,	/* Default behavior */
	KMC_BIT_NOMAGAZINE	= 2,	/* XXX: Unsupported */
	KMC_BIT_NOHASH		= 3,	/* XXX: Unsupported */
	KMC_BIT_QCACHE		= 4,	/* XXX: Unsupported */
	KMC_BIT_KMEM		= 5,	/* Back the cache with kmem */
	KMC_BIT_VMEM		= 6,	/* Back the cache with vmem */
	KMC_BIT_SLAB		= 7,	/* Back the cache with a Linux slab */
	KMC_BIT_OFFSLAB		= 8,	/* Objects are kept off the slab */
	KMC_BIT_NOEMERGENCY	= 9,	/* Emergency objects disabled */

	KMC_BIT_DEADLOCKED	= 14,	/* A deadlock was detected */
	KMC_BIT_GROWING		= 15,	/* Grow is in progress */
	KMC_BIT_REAPING		= 16,	/* Reap is in progress */
	KMC_BIT_DESTROY		= 17,	/* Destroy is in progress */

	KMC_BIT_TOTAL		= 18,	/* Helper bit for the proc handler */
	KMC_BIT_ALLOC		= 19,	/* Helper bit for the proc handler */
	KMC_BIT_MAX		= 20,	/* Helper bit for the proc handler */
};
301
/*
 * Result codes a kmem "move" callback hands back to the allocator.
 */
typedef enum kmem_cbrc {
	KMEM_CBRC_YES		= 0,	/* The object was moved */
	KMEM_CBRC_NO		= 1,	/* The object was not moved */
	KMEM_CBRC_LATER		= 2,	/* Not moved, retry at a later time */
	KMEM_CBRC_DONT_NEED	= 3,	/* Neither of the objects is needed */
	KMEM_CBRC_DONT_KNOW	= 4,	/* The object is unknown */
} kmem_cbrc_t;
310
311 #define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH)
312 #define KMC_NODEBUG (1 << KMC_BIT_NODEBUG)
313 #define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE)
314 #define KMC_NOHASH (1 << KMC_BIT_NOHASH)
315 #define KMC_QCACHE (1 << KMC_BIT_QCACHE)
316 #define KMC_KMEM (1 << KMC_BIT_KMEM)
317 #define KMC_VMEM (1 << KMC_BIT_VMEM)
318 #define KMC_SLAB (1 << KMC_BIT_SLAB)
319 #define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
320 #define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
321 #define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
322 #define KMC_GROWING (1 << KMC_BIT_GROWING)
323 #define KMC_REAPING (1 << KMC_BIT_REAPING)
324 #define KMC_DESTROY (1 << KMC_BIT_DESTROY)
325 #define KMC_TOTAL (1 << KMC_BIT_TOTAL)
326 #define KMC_ALLOC (1 << KMC_BIT_ALLOC)
327 #define KMC_MAX (1 << KMC_BIT_MAX)
328
329 #define KMC_REAP_CHUNK INT_MAX
330 #define KMC_DEFAULT_SEEKS 1
331
332 #define KMC_EXPIRE_AGE 0x1 /* Due to age */
333 #define KMC_EXPIRE_MEM 0x2 /* Due to low memory */
334
335 #define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */
336
337 extern unsigned int spl_kmem_cache_expire;
338 extern struct list_head spl_kmem_cache_list;
339 extern struct rw_semaphore spl_kmem_cache_sem;
340
341 #define SKM_MAGIC 0x2e2e2e2e
342 #define SKO_MAGIC 0x20202020
343 #define SKS_MAGIC 0x22222222
344 #define SKC_MAGIC 0x2c2c2c2c
345
346 #define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */
347 #define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */
348 #define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */
349 #define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */
350 #define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */
351
352 #define POINTER_IS_VALID(p) 0 /* Unimplemented */
353 #define POINTER_INVALIDATE(pp) /* Unimplemented */
354
355 typedef int (*spl_kmem_ctor_t)(void *, void *, int);
356 typedef void (*spl_kmem_dtor_t)(void *, void *);
357 typedef void (*spl_kmem_reclaim_t)(void *);
358
/*
 * Magazine header followed by a variable number of object pointers.
 *
 * skm_objs is a C99 flexible array member (previously the GNU zero-length
 * array form); sizeof() still covers only the fixed header, so the storage
 * for the pointer slots must be added by whoever allocates the magazine.
 */
typedef struct spl_kmem_magazine {
	uint32_t		skm_magic;	/* Sanity magic */
	uint32_t		skm_avail;	/* Available objects */
	uint32_t		skm_size;	/* Magazine size */
	uint32_t		skm_refill;	/* Batch refill size */
	struct spl_kmem_cache	*skm_cache;	/* Owned by cache */
	unsigned long		skm_age;	/* Last cache access */
	unsigned int		skm_cpu;	/* Owned by cpu */
	void			*skm_objs[];	/* Object pointers */
} spl_kmem_magazine_t;
369
370 typedef struct spl_kmem_obj {
371 uint32_t sko_magic; /* Sanity magic */
372 void *sko_addr; /* Buffer address */
373 struct spl_kmem_slab *sko_slab; /* Owned by slab */
374 struct list_head sko_list; /* Free object list linkage */
375 } spl_kmem_obj_t;
376
377 typedef struct spl_kmem_slab {
378 uint32_t sks_magic; /* Sanity magic */
379 uint32_t sks_objs; /* Objects per slab */
380 struct spl_kmem_cache *sks_cache; /* Owned by cache */
381 struct list_head sks_list; /* Slab list linkage */
382 struct list_head sks_free_list; /* Free object list */
383 unsigned long sks_age; /* Last modify jiffie */
384 uint32_t sks_ref; /* Ref count used objects */
385 } spl_kmem_slab_t;
386
387 typedef struct spl_kmem_alloc {
388 struct spl_kmem_cache *ska_cache; /* Owned by cache */
389 int ska_flags; /* Allocation flags */
390 taskq_ent_t ska_tqe; /* Task queue entry */
391 } spl_kmem_alloc_t;
392
393 typedef struct spl_kmem_emergency {
394 struct rb_node ske_node; /* Emergency tree linkage */
395 void *ske_obj; /* Buffer address */
396 } spl_kmem_emergency_t;
397
398 typedef struct spl_kmem_cache {
399 uint32_t skc_magic; /* Sanity magic */
400 uint32_t skc_name_size; /* Name length */
401 char *skc_name; /* Name string */
402 spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */
403 uint32_t skc_mag_size; /* Magazine size */
404 uint32_t skc_mag_refill; /* Magazine refill count */
405 spl_kmem_ctor_t skc_ctor; /* Constructor */
406 spl_kmem_dtor_t skc_dtor; /* Destructor */
407 spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
408 void *skc_private; /* Private data */
409 void *skc_vmp; /* Unused */
410 struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */
411 unsigned long skc_flags; /* Flags */
412 uint32_t skc_obj_size; /* Object size */
413 uint32_t skc_obj_align; /* Object alignment */
414 uint32_t skc_slab_objs; /* Objects per slab */
415 uint32_t skc_slab_size; /* Slab size */
416 uint32_t skc_delay; /* Slab reclaim interval */
417 uint32_t skc_reap; /* Slab reclaim count */
418 atomic_t skc_ref; /* Ref count callers */
419 taskqid_t skc_taskqid; /* Slab reclaim task */
420 struct list_head skc_list; /* List of caches linkage */
421 struct list_head skc_complete_list;/* Completely alloc'ed */
422 struct list_head skc_partial_list; /* Partially alloc'ed */
423 struct rb_root skc_emergency_tree; /* Min sized objects */
424 spinlock_t skc_lock; /* Cache lock */
425 wait_queue_head_t skc_waitq; /* Allocation waiters */
426 uint64_t skc_slab_fail; /* Slab alloc failures */
427 uint64_t skc_slab_create;/* Slab creates */
428 uint64_t skc_slab_destroy;/* Slab destroys */
429 uint64_t skc_slab_total; /* Slab total current */
430 uint64_t skc_slab_alloc; /* Slab alloc current */
431 uint64_t skc_slab_max; /* Slab max historic */
432 uint64_t skc_obj_total; /* Obj total current */
433 uint64_t skc_obj_alloc; /* Obj alloc current */
434 uint64_t skc_obj_max; /* Obj max historic */
435 uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */
436 uint64_t skc_obj_emergency; /* Obj emergency current */
437 uint64_t skc_obj_emergency_max; /* Obj emergency max */
438 } spl_kmem_cache_t;
439 #define kmem_cache_t spl_kmem_cache_t
440
441 extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size,
442 size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor,
443 spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags);
444 extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
445 kmem_cbrc_t (*)(void *, void *, size_t, void *));
446 extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
447 extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
448 extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
449 extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
450 extern void spl_kmem_reap(void);
451
452 int spl_kmem_init(void);
453 void spl_kmem_fini(void);
454
/*
 * kmem_cache_*() and kmem_*() convenience names which forward to the
 * spl_kmem_cache_*() and spl_kmem_*() functions.
 */
#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \
	spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags)
#define kmem_cache_set_move(skc, move)	spl_kmem_cache_set_move(skc, move)
#define kmem_cache_destroy(skc)		spl_kmem_cache_destroy(skc)
#define kmem_cache_alloc(skc, flags)	spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj)	spl_kmem_cache_free(skc, obj)

/*
 * Reap using the cache's own skc_reap count.  'skc' is parenthesized so
 * that expression arguments (e.g. "base + i") bind correctly before the
 * member access.  Note that 'skc' is expanded twice, so it must not have
 * side effects.
 */
#define kmem_cache_reap_now(skc)	\
	spl_kmem_cache_reap_now((skc), (skc)->skc_reap)
#define kmem_reap()			spl_kmem_reap()

/*
 * True when 'ptr' lies in [VMALLOC_START, VMALLOC_END).  'ptr' is expanded
 * twice, so it must not have side effects.
 */
#define kmem_virt(ptr)			(((ptr) >= (void *)VMALLOC_START) && \
					((ptr) < (void *)VMALLOC_END))
466
467 /*
468 * Allow custom slab allocation flags to be set for KMC_SLAB based caches.
469 * One use for this function is to ensure the __GFP_COMP flag is part of
470 * the default allocation mask which ensures higher order allocations are
471 * properly refcounted. This flag was added to the default ->allocflags
472 * as of Linux 3.11.
473 */
474 static inline void
475 kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
476 {
477 if (skc->skc_linux_cache == NULL)
478 return;
479
480 #if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
481 skc->skc_linux_cache->allocflags |= flags;
482 #elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
483 skc->skc_linux_cache->gfpflags |= flags;
484 #endif
485 }
486
487 #endif /* _SPL_KMEM_H */