1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Slab allocator functions that are independent of the allocator strategy
4 *
5 * (C) 2012 Christoph Lameter <cl@linux.com>
6 */
7 #include <linux/slab.h>
8
9 #include <linux/mm.h>
10 #include <linux/poison.h>
11 #include <linux/interrupt.h>
12 #include <linux/memory.h>
13 #include <linux/cache.h>
14 #include <linux/compiler.h>
15 #include <linux/module.h>
16 #include <linux/cpu.h>
17 #include <linux/uaccess.h>
18 #include <linux/seq_file.h>
19 #include <linux/proc_fs.h>
20 #include <linux/debugfs.h>
21 #include <asm/cacheflush.h>
22 #include <asm/tlbflush.h>
23 #include <asm/page.h>
24 #include <linux/memcontrol.h>
25
26 #define CREATE_TRACE_POINTS
27 #include <trace/events/kmem.h>
28
29 #include "internal.h"
30
31 #include "slab.h"
32
33 enum slab_state slab_state;
34 LIST_HEAD(slab_caches);
35 DEFINE_MUTEX(slab_mutex);
36 struct kmem_cache *kmem_cache;
37
38 #ifdef CONFIG_HARDENED_USERCOPY
39 bool usercopy_fallback __ro_after_init =
40 IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
41 module_param(usercopy_fallback, bool, 0400);
42 MODULE_PARM_DESC(usercopy_fallback,
43 "WARN instead of reject usercopy whitelist violations");
44 #endif
45
46 static LIST_HEAD(slab_caches_to_rcu_destroy);
47 static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
48 static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
49 slab_caches_to_rcu_destroy_workfn);
50
51 /*
52 * Set of flags that will prevent slab merging
53 */
54 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
55 SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
56 SLAB_FAILSLAB | SLAB_KASAN)
57
58 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
59 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
60
61 /*
62 * Merge control. If this is set then no merging of slab caches will occur.
63 */
64 static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
65
66 static int __init setup_slab_nomerge(char *str)
67 {
68 slab_nomerge = true;
69 return 1;
70 }
71
72 #ifdef CONFIG_SLUB
73 __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
74 #endif
75
76 __setup("slab_nomerge", setup_slab_nomerge);
77
78 /*
79 * Determine the size of a slab object
80 */
81 unsigned int kmem_cache_size(struct kmem_cache *s)
82 {
83 return s->object_size;
84 }
85 EXPORT_SYMBOL(kmem_cache_size);
86
87 #ifdef CONFIG_DEBUG_VM
88 static int kmem_cache_sanity_check(const char *name, unsigned int size)
89 {
90 if (!name || in_interrupt() || size < sizeof(void *) ||
91 size > KMALLOC_MAX_SIZE) {
92 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
93 return -EINVAL;
94 }
95
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
97 return 0;
98 }
99 #else
100 static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
101 {
102 return 0;
103 }
104 #endif
105
106 void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
107 {
108 size_t i;
109
110 for (i = 0; i < nr; i++) {
111 if (s)
112 kmem_cache_free(s, p[i]);
113 else
114 kfree(p[i]);
115 }
116 }
117
118 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
119 void **p)
120 {
121 size_t i;
122
123 for (i = 0; i < nr; i++) {
124 void *x = p[i] = kmem_cache_alloc(s, flags);
125 if (!x) {
126 __kmem_cache_free_bulk(s, i, p);
127 return 0;
128 }
129 }
130 return i;
131 }
132
133 #ifdef CONFIG_MEMCG_KMEM
134
135 LIST_HEAD(slab_root_caches);
136 static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
137
138 static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
139
140 void slab_init_memcg_params(struct kmem_cache *s)
141 {
142 s->memcg_params.root_cache = NULL;
143 RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
144 INIT_LIST_HEAD(&s->memcg_params.children);
145 s->memcg_params.dying = false;
146 }
147
148 static int init_memcg_params(struct kmem_cache *s,
149 struct kmem_cache *root_cache)
150 {
151 struct memcg_cache_array *arr;
152
153 if (root_cache) {
154 int ret = percpu_ref_init(&s->memcg_params.refcnt,
155 kmemcg_cache_shutdown,
156 0, GFP_KERNEL);
157 if (ret)
158 return ret;
159
160 s->memcg_params.root_cache = root_cache;
161 INIT_LIST_HEAD(&s->memcg_params.children_node);
162 INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
163 return 0;
164 }
165
166 slab_init_memcg_params(s);
167
168 if (!memcg_nr_cache_ids)
169 return 0;
170
171 arr = kvzalloc(sizeof(struct memcg_cache_array) +
172 memcg_nr_cache_ids * sizeof(void *),
173 GFP_KERNEL);
174 if (!arr)
175 return -ENOMEM;
176
177 RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
178 return 0;
179 }
180
181 static void destroy_memcg_params(struct kmem_cache *s)
182 {
183 if (is_root_cache(s)) {
184 kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
185 } else {
186 mem_cgroup_put(s->memcg_params.memcg);
187 WRITE_ONCE(s->memcg_params.memcg, NULL);
188 percpu_ref_exit(&s->memcg_params.refcnt);
189 }
190 }
191
192 static void free_memcg_params(struct rcu_head *rcu)
193 {
194 struct memcg_cache_array *old;
195
196 old = container_of(rcu, struct memcg_cache_array, rcu);
197 kvfree(old);
198 }
199
200 static int update_memcg_params(struct kmem_cache *s, int new_array_size)
201 {
202 struct memcg_cache_array *old, *new;
203
204 new = kvzalloc(sizeof(struct memcg_cache_array) +
205 new_array_size * sizeof(void *), GFP_KERNEL);
206 if (!new)
207 return -ENOMEM;
208
209 old = rcu_dereference_protected(s->memcg_params.memcg_caches,
210 lockdep_is_held(&slab_mutex));
211 if (old)
212 memcpy(new->entries, old->entries,
213 memcg_nr_cache_ids * sizeof(void *));
214
215 rcu_assign_pointer(s->memcg_params.memcg_caches, new);
216 if (old)
217 call_rcu(&old->rcu, free_memcg_params);
218 return 0;
219 }
220
221 int memcg_update_all_caches(int num_memcgs)
222 {
223 struct kmem_cache *s;
224 int ret = 0;
225
226 mutex_lock(&slab_mutex);
227 list_for_each_entry(s, &slab_root_caches, root_caches_node) {
228 ret = update_memcg_params(s, num_memcgs);
229 /*
230 * Instead of freeing the memory, we'll just leave the caches
231 * up to this point in an updated state.
232 */
233 if (ret)
234 break;
235 }
236 mutex_unlock(&slab_mutex);
237 return ret;
238 }
239
240 void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
241 {
242 if (is_root_cache(s)) {
243 list_add(&s->root_caches_node, &slab_root_caches);
244 } else {
245 css_get(&memcg->css);
246 s->memcg_params.memcg = memcg;
247 list_add(&s->memcg_params.children_node,
248 &s->memcg_params.root_cache->memcg_params.children);
249 list_add(&s->memcg_params.kmem_caches_node,
250 &s->memcg_params.memcg->kmem_caches);
251 }
252 }
253
254 static void memcg_unlink_cache(struct kmem_cache *s)
255 {
256 if (is_root_cache(s)) {
257 list_del(&s->root_caches_node);
258 } else {
259 list_del(&s->memcg_params.children_node);
260 list_del(&s->memcg_params.kmem_caches_node);
261 }
262 }
263 #else
264 static inline int init_memcg_params(struct kmem_cache *s,
265 struct kmem_cache *root_cache)
266 {
267 return 0;
268 }
269
270 static inline void destroy_memcg_params(struct kmem_cache *s)
271 {
272 }
273
274 static inline void memcg_unlink_cache(struct kmem_cache *s)
275 {
276 }
277 #endif /* CONFIG_MEMCG_KMEM */
278
279 /*
280 * Figure out what the alignment of the objects will be given a set of
281 * flags, a user specified alignment and the size of the objects.
282 */
283 static unsigned int calculate_alignment(slab_flags_t flags,
284 unsigned int align, unsigned int size)
285 {
286 /*
287 * If the user wants hardware cache aligned objects then follow that
288 * suggestion if the object is sufficiently large.
289 *
290 * The hardware cache alignment cannot override the specified
291 * alignment though. If that is greater, then use it.
292 */
293 if (flags & SLAB_HWCACHE_ALIGN) {
294 unsigned int ralign;
295
296 ralign = cache_line_size();
297 while (size <= ralign / 2)
298 ralign /= 2;
299 align = max(align, ralign);
300 }
301
302 if (align < ARCH_SLAB_MINALIGN)
303 align = ARCH_SLAB_MINALIGN;
304
305 return ALIGN(align, sizeof(void *));
306 }
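/*
 * Editor's note, illustrative example: with SLAB_HWCACHE_ALIGN and a
 * 64-byte cache line, a 20-byte object halves ralign from 64 to 32
 * (20 <= 32) and stops there (20 > 16), so its objects end up 32-byte
 * aligned, while a 100-byte object keeps the full 64-byte alignment.
 * The chosen value is finally rounded up to a multiple of sizeof(void *).
 */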
307
308 /*
309 * Find a mergeable slab cache
310 */
311 int slab_unmergeable(struct kmem_cache *s)
312 {
313 if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
314 return 1;
315
316 if (!is_root_cache(s))
317 return 1;
318
319 if (s->ctor)
320 return 1;
321
322 if (s->usersize)
323 return 1;
324
325 /*
326 * We may have set a slab to be unmergeable during bootstrap.
327 */
328 if (s->refcount < 0)
329 return 1;
330
331 #ifdef CONFIG_MEMCG_KMEM
332 /*
333 * Skip the dying kmem_cache.
334 */
335 if (s->memcg_params.dying)
336 return 1;
337 #endif
338
339 return 0;
340 }
341
342 struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
343 slab_flags_t flags, const char *name, void (*ctor)(void *))
344 {
345 struct kmem_cache *s;
346
347 if (slab_nomerge)
348 return NULL;
349
350 if (ctor)
351 return NULL;
352
353 size = ALIGN(size, sizeof(void *));
354 align = calculate_alignment(flags, align, size);
355 size = ALIGN(size, align);
356 flags = kmem_cache_flags(size, flags, name, NULL);
357
358 if (flags & SLAB_NEVER_MERGE)
359 return NULL;
360
361 list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
362 if (slab_unmergeable(s))
363 continue;
364
365 if (size > s->size)
366 continue;
367
368 if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
369 continue;
370 /*
371 * Check if alignment is compatible.
372 * Courtesy of Adrian Drzewiecki
373 */
374 if ((s->size & ~(align - 1)) != s->size)
375 continue;
376
377 if (s->size - size >= sizeof(void *))
378 continue;
379
380 if (IS_ENABLED(CONFIG_SLAB) && align &&
381 (align > s->align || s->align % align))
382 continue;
383
384 return s;
385 }
386 return NULL;
387 }
388
389 static struct kmem_cache *create_cache(const char *name,
390 unsigned int object_size, unsigned int align,
391 slab_flags_t flags, unsigned int useroffset,
392 unsigned int usersize, void (*ctor)(void *),
393 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
394 {
395 struct kmem_cache *s;
396 int err;
397
398 if (WARN_ON(useroffset + usersize > object_size))
399 useroffset = usersize = 0;
400
401 err = -ENOMEM;
402 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
403 if (!s)
404 goto out;
405
406 s->name = name;
407 s->size = s->object_size = object_size;
408 s->align = align;
409 s->ctor = ctor;
410 s->useroffset = useroffset;
411 s->usersize = usersize;
412
413 err = init_memcg_params(s, root_cache);
414 if (err)
415 goto out_free_cache;
416
417 err = __kmem_cache_create(s, flags);
418 if (err)
419 goto out_free_cache;
420
421 s->refcount = 1;
422 list_add(&s->list, &slab_caches);
423 memcg_link_cache(s, memcg);
424 out:
425 if (err)
426 return ERR_PTR(err);
427 return s;
428
429 out_free_cache:
430 destroy_memcg_params(s);
431 kmem_cache_free(kmem_cache, s);
432 goto out;
433 }
434
435 /**
436 * kmem_cache_create_usercopy - Create a cache with a region suitable
437 * for copying to userspace
438 * @name: A string which is used in /proc/slabinfo to identify this cache.
439 * @size: The size of objects to be created in this cache.
440 * @align: The required alignment for the objects.
441 * @flags: SLAB flags
442 * @useroffset: Usercopy region offset
443 * @usersize: Usercopy region size
444 * @ctor: A constructor for the objects.
445 *
446 * Cannot be called within an interrupt, but can be interrupted.
447 * The @ctor is run when new pages are allocated by the cache.
448 *
449 * The flags are
450 *
451 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
452 * to catch references to uninitialised memory.
453 *
454 * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
455 * for buffer overruns.
456 *
457 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
458 * cacheline. This can be beneficial if you're counting cycles as closely
459 * as davem.
460 *
461 * Return: a pointer to the cache on success, NULL on failure.
462 */
463 struct kmem_cache *
464 kmem_cache_create_usercopy(const char *name,
465 unsigned int size, unsigned int align,
466 slab_flags_t flags,
467 unsigned int useroffset, unsigned int usersize,
468 void (*ctor)(void *))
469 {
470 struct kmem_cache *s = NULL;
471 const char *cache_name;
472 int err;
473
474 get_online_cpus();
475 get_online_mems();
476 memcg_get_cache_ids();
477
478 mutex_lock(&slab_mutex);
479
480 err = kmem_cache_sanity_check(name, size);
481 if (err) {
482 goto out_unlock;
483 }
484
485 /* Refuse requests with allocator specific flags */
486 if (flags & ~SLAB_FLAGS_PERMITTED) {
487 err = -EINVAL;
488 goto out_unlock;
489 }
490
491 /*
492 * Some allocators will constrain the set of valid flags to a subset
493 * of all flags. We expect them to define CACHE_CREATE_MASK in this
494 * case, and we'll just provide them with a sanitized version of the
495 * passed flags.
496 */
497 flags &= CACHE_CREATE_MASK;
498
499 /* Fail closed on bad usersize or useroffset values. */
500 if (WARN_ON(!usersize && useroffset) ||
501 WARN_ON(size < usersize || size - usersize < useroffset))
502 usersize = useroffset = 0;
503
504 if (!usersize)
505 s = __kmem_cache_alias(name, size, align, flags, ctor);
506 if (s)
507 goto out_unlock;
508
509 cache_name = kstrdup_const(name, GFP_KERNEL);
510 if (!cache_name) {
511 err = -ENOMEM;
512 goto out_unlock;
513 }
514
515 s = create_cache(cache_name, size,
516 calculate_alignment(flags, align, size),
517 flags, useroffset, usersize, ctor, NULL, NULL);
518 if (IS_ERR(s)) {
519 err = PTR_ERR(s);
520 kfree_const(cache_name);
521 }
522
523 out_unlock:
524 mutex_unlock(&slab_mutex);
525
526 memcg_put_cache_ids();
527 put_online_mems();
528 put_online_cpus();
529
530 if (err) {
531 if (flags & SLAB_PANIC)
532 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
533 name, err);
534 else {
535 pr_warn("kmem_cache_create(%s) failed with error %d\n",
536 name, err);
537 dump_stack();
538 }
539 return NULL;
540 }
541 return s;
542 }
543 EXPORT_SYMBOL(kmem_cache_create_usercopy);
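/*
 * Editor's usage sketch (hypothetical "struct foo" and foo_cachep, not part
 * of this file): whitelist only the region that is legitimately copied
 * to/from user space.
 *
 *	struct foo {
 *		spinlock_t lock;
 *		char name[32];		// copied to/from user space
 *	};
 *
 *	foo_cachep = kmem_cache_create_usercopy("foo", sizeof(struct foo),
 *				0, SLAB_HWCACHE_ALIGN,
 *				offsetof(struct foo, name),
 *				sizeof_field(struct foo, name),
 *				NULL);
 */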
544
545 /**
546 * kmem_cache_create - Create a cache.
547 * @name: A string which is used in /proc/slabinfo to identify this cache.
548 * @size: The size of objects to be created in this cache.
549 * @align: The required alignment for the objects.
550 * @flags: SLAB flags
551 * @ctor: A constructor for the objects.
552 *
553 * Cannot be called within an interrupt, but can be interrupted.
554 * The @ctor is run when new pages are allocated by the cache.
555 *
556 * The flags are
557 *
558 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
559 * to catch references to uninitialised memory.
560 *
561 * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
562 * for buffer overruns.
563 *
564 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
565 * cacheline. This can be beneficial if you're counting cycles as closely
566 * as davem.
567 *
568 * Return: a pointer to the cache on success, NULL on failure.
569 */
570 struct kmem_cache *
571 kmem_cache_create(const char *name, unsigned int size, unsigned int align,
572 slab_flags_t flags, void (*ctor)(void *))
573 {
574 return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
575 ctor);
576 }
577 EXPORT_SYMBOL(kmem_cache_create);
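/*
 * Editor's usage sketch (hypothetical "struct bar" and bar_cachep): the
 * common case without a usercopy region.
 *
 *	bar_cachep = kmem_cache_create("bar", sizeof(struct bar), 0,
 *				       SLAB_HWCACHE_ALIGN, NULL);
 *	obj = kmem_cache_alloc(bar_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(bar_cachep, obj);
 *	kmem_cache_destroy(bar_cachep);
 */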
578
579 static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
580 {
581 LIST_HEAD(to_destroy);
582 struct kmem_cache *s, *s2;
583
584 /*
585 * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
586 * @slab_caches_to_rcu_destroy list. The slab pages are freed
587 * through RCU and the associated kmem_cache is dereferenced
588 * while freeing the pages, so the kmem_caches should be freed only
589 * after the pending RCU operations are finished. As rcu_barrier()
590 * is a pretty slow operation, we batch all pending destructions
591 * asynchronously.
592 */
593 mutex_lock(&slab_mutex);
594 list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
595 mutex_unlock(&slab_mutex);
596
597 if (list_empty(&to_destroy))
598 return;
599
600 rcu_barrier();
601
602 list_for_each_entry_safe(s, s2, &to_destroy, list) {
603 #ifdef SLAB_SUPPORTS_SYSFS
604 sysfs_slab_release(s);
605 #else
606 slab_kmem_cache_release(s);
607 #endif
608 }
609 }
610
611 static int shutdown_cache(struct kmem_cache *s)
612 {
613 /* free asan quarantined objects */
614 kasan_cache_shutdown(s);
615
616 if (__kmem_cache_shutdown(s) != 0)
617 return -EBUSY;
618
619 memcg_unlink_cache(s);
620 list_del(&s->list);
621
622 if (s->flags & SLAB_TYPESAFE_BY_RCU) {
623 #ifdef SLAB_SUPPORTS_SYSFS
624 sysfs_slab_unlink(s);
625 #endif
626 list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
627 schedule_work(&slab_caches_to_rcu_destroy_work);
628 } else {
629 #ifdef SLAB_SUPPORTS_SYSFS
630 sysfs_slab_unlink(s);
631 sysfs_slab_release(s);
632 #else
633 slab_kmem_cache_release(s);
634 #endif
635 }
636
637 return 0;
638 }
639
640 #ifdef CONFIG_MEMCG_KMEM
641 /*
642 * memcg_create_kmem_cache - Create a cache for a memory cgroup.
643 * @memcg: The memory cgroup the new cache is for.
644 * @root_cache: The parent of the new cache.
645 *
646 * This function attempts to create a kmem cache that will serve allocation
647 * requests going from @memcg to @root_cache. The new cache inherits properties
648 * from its parent.
649 */
650 void memcg_create_kmem_cache(struct mem_cgroup *memcg,
651 struct kmem_cache *root_cache)
652 {
653 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
654 struct cgroup_subsys_state *css = &memcg->css;
655 struct memcg_cache_array *arr;
656 struct kmem_cache *s = NULL;
657 char *cache_name;
658 int idx;
659
660 get_online_cpus();
661 get_online_mems();
662
663 mutex_lock(&slab_mutex);
664
665 /*
666 * The memory cgroup could have been offlined while the cache
667 * creation work was pending.
668 */
669 if (memcg->kmem_state != KMEM_ONLINE)
670 goto out_unlock;
671
672 idx = memcg_cache_id(memcg);
673 arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
674 lockdep_is_held(&slab_mutex));
675
676 /*
677 * Since per-memcg caches are created asynchronously on first
678 * allocation (see memcg_kmem_get_cache()), several threads can try to
679 * create the same cache, but only one of them may succeed.
680 */
681 if (arr->entries[idx])
682 goto out_unlock;
683
684 cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
685 cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
686 css->serial_nr, memcg_name_buf);
687 if (!cache_name)
688 goto out_unlock;
689
690 s = create_cache(cache_name, root_cache->object_size,
691 root_cache->align,
692 root_cache->flags & CACHE_CREATE_MASK,
693 root_cache->useroffset, root_cache->usersize,
694 root_cache->ctor, memcg, root_cache);
695 /*
696 * If we could not create a memcg cache, do not complain, because
697 * that's not critical at all as we can always proceed with the root
698 * cache.
699 */
700 if (IS_ERR(s)) {
701 kfree(cache_name);
702 goto out_unlock;
703 }
704
705 /*
706 * Since readers won't lock (see memcg_kmem_get_cache()), we need a
707 * barrier here to ensure nobody will see the kmem_cache partially
708 * initialized.
709 */
710 smp_wmb();
711 arr->entries[idx] = s;
712
713 out_unlock:
714 mutex_unlock(&slab_mutex);
715
716 put_online_mems();
717 put_online_cpus();
718 }
719
720 static void kmemcg_workfn(struct work_struct *work)
721 {
722 struct kmem_cache *s = container_of(work, struct kmem_cache,
723 memcg_params.work);
724
725 get_online_cpus();
726 get_online_mems();
727
728 mutex_lock(&slab_mutex);
729 s->memcg_params.work_fn(s);
730 mutex_unlock(&slab_mutex);
731
732 put_online_mems();
733 put_online_cpus();
734 }
735
736 static void kmemcg_rcufn(struct rcu_head *head)
737 {
738 struct kmem_cache *s = container_of(head, struct kmem_cache,
739 memcg_params.rcu_head);
740
741 /*
742 * We need to grab blocking locks. Bounce to ->work. The
743 * work item shares the space with the RCU head and can't be
744 * initialized earlier.
745 */
746 INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
747 queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
748 }
749
750 static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
751 {
752 WARN_ON(shutdown_cache(s));
753 }
754
755 static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
756 {
757 struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
758 memcg_params.refcnt);
759 unsigned long flags;
760
761 spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
762 if (s->memcg_params.root_cache->memcg_params.dying)
763 goto unlock;
764
765 s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
766 INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
767 queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
768
769 unlock:
770 spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
771 }
772
773 static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
774 {
775 __kmemcg_cache_deactivate_after_rcu(s);
776 percpu_ref_kill(&s->memcg_params.refcnt);
777 }
778
779 static void kmemcg_cache_deactivate(struct kmem_cache *s)
780 {
781 if (WARN_ON_ONCE(is_root_cache(s)))
782 return;
783
784 __kmemcg_cache_deactivate(s);
785 s->flags |= SLAB_DEACTIVATED;
786
787 /*
788 * memcg_kmem_wq_lock is used to synchronize memcg_params.dying
789 * flag and make sure that no new kmem_cache deactivation tasks
790 * are queued (see flush_memcg_workqueue() ).
791 */
792 spin_lock_irq(&memcg_kmem_wq_lock);
793 if (s->memcg_params.root_cache->memcg_params.dying)
794 goto unlock;
795
796 s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
797 call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
798 unlock:
799 spin_unlock_irq(&memcg_kmem_wq_lock);
800 }
801
802 void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
803 struct mem_cgroup *parent)
804 {
805 int idx;
806 struct memcg_cache_array *arr;
807 struct kmem_cache *s, *c;
808 unsigned int nr_reparented;
809
810 idx = memcg_cache_id(memcg);
811
812 get_online_cpus();
813 get_online_mems();
814
815 mutex_lock(&slab_mutex);
816 list_for_each_entry(s, &slab_root_caches, root_caches_node) {
817 arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
818 lockdep_is_held(&slab_mutex));
819 c = arr->entries[idx];
820 if (!c)
821 continue;
822
823 kmemcg_cache_deactivate(c);
824 arr->entries[idx] = NULL;
825 }
826 nr_reparented = 0;
827 list_for_each_entry(s, &memcg->kmem_caches,
828 memcg_params.kmem_caches_node) {
829 WRITE_ONCE(s->memcg_params.memcg, parent);
830 css_put(&memcg->css);
831 nr_reparented++;
832 }
833 if (nr_reparented) {
834 list_splice_init(&memcg->kmem_caches,
835 &parent->kmem_caches);
836 css_get_many(&parent->css, nr_reparented);
837 }
838 mutex_unlock(&slab_mutex);
839
840 put_online_mems();
841 put_online_cpus();
842 }
843
844 static int shutdown_memcg_caches(struct kmem_cache *s)
845 {
846 struct memcg_cache_array *arr;
847 struct kmem_cache *c, *c2;
848 LIST_HEAD(busy);
849 int i;
850
851 BUG_ON(!is_root_cache(s));
852
853 /*
854 * First, shutdown active caches, i.e. caches that belong to online
855 * memory cgroups.
856 */
857 arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
858 lockdep_is_held(&slab_mutex));
859 for_each_memcg_cache_index(i) {
860 c = arr->entries[i];
861 if (!c)
862 continue;
863 if (shutdown_cache(c))
864 /*
865 * The cache still has objects. Move it to a temporary
866 * list so as not to try to destroy it for a second
867 * time while iterating over inactive caches below.
868 */
869 list_move(&c->memcg_params.children_node, &busy);
870 else
871 /*
872 * The cache is empty and will be destroyed soon. Clear
873 * the pointer to it in the memcg_caches array so that
874 * it will never be accessed even if the root cache
875 * stays alive.
876 */
877 arr->entries[i] = NULL;
878 }
879
880 /*
881 * Second, shutdown all caches left from memory cgroups that are now
882 * offline.
883 */
884 list_for_each_entry_safe(c, c2, &s->memcg_params.children,
885 memcg_params.children_node)
886 shutdown_cache(c);
887
888 list_splice(&busy, &s->memcg_params.children);
889
890 /*
891 * A cache being destroyed must be empty. In particular, this means
892 * that all per memcg caches attached to it must be empty too.
893 */
894 if (!list_empty(&s->memcg_params.children))
895 return -EBUSY;
896 return 0;
897 }
898
899 static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
900 {
901 spin_lock_irq(&memcg_kmem_wq_lock);
902 s->memcg_params.dying = true;
903 spin_unlock_irq(&memcg_kmem_wq_lock);
904 }
905
906 static void flush_memcg_workqueue(struct kmem_cache *s)
907 {
908 /*
909 * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
910 * sure all registered rcu callbacks have been invoked.
911 */
912 rcu_barrier();
913
914 /*
915 * SLAB and SLUB create memcg kmem_caches through a workqueue, and SLUB
916 * also deactivates memcg kmem_caches through a workqueue. Make sure all
917 * previously queued work items have been processed.
918 */
919 if (likely(memcg_kmem_cache_wq))
920 flush_workqueue(memcg_kmem_cache_wq);
921
922 /*
923 * If we're racing with children kmem_cache deactivation, it might
924 * take another rcu grace period to complete their destruction.
925 * At this moment the corresponding percpu_ref_kill() call should be
926 * done, but it might take another rcu grace period to complete
927 * switching to the atomic mode.
928 * Please note that we check without grabbing the slab_mutex. It's safe
929 * because at this moment the children list can't grow.
930 */
931 if (!list_empty(&s->memcg_params.children))
932 rcu_barrier();
933 }
934 #else
935 static inline int shutdown_memcg_caches(struct kmem_cache *s)
936 {
937 return 0;
938 }
939 #endif /* CONFIG_MEMCG_KMEM */
940
941 void slab_kmem_cache_release(struct kmem_cache *s)
942 {
943 __kmem_cache_release(s);
944 destroy_memcg_params(s);
945 kfree_const(s->name);
946 kmem_cache_free(kmem_cache, s);
947 }
948
949 void kmem_cache_destroy(struct kmem_cache *s)
950 {
951 int err;
952
953 if (unlikely(!s))
954 return;
955
956 get_online_cpus();
957 get_online_mems();
958
959 mutex_lock(&slab_mutex);
960
961 s->refcount--;
962 if (s->refcount)
963 goto out_unlock;
964
965 #ifdef CONFIG_MEMCG_KMEM
966 memcg_set_kmem_cache_dying(s);
967
968 mutex_unlock(&slab_mutex);
969
970 put_online_mems();
971 put_online_cpus();
972
973 flush_memcg_workqueue(s);
974
975 get_online_cpus();
976 get_online_mems();
977
978 mutex_lock(&slab_mutex);
979 #endif
980
981 err = shutdown_memcg_caches(s);
982 if (!err)
983 err = shutdown_cache(s);
984
985 if (err) {
986 pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
987 s->name);
988 dump_stack();
989 }
990 out_unlock:
991 mutex_unlock(&slab_mutex);
992
993 put_online_mems();
994 put_online_cpus();
995 }
996 EXPORT_SYMBOL(kmem_cache_destroy);
997
998 /**
999 * kmem_cache_shrink - Shrink a cache.
1000 * @cachep: The cache to shrink.
1001 *
1002 * Releases as many slabs as possible for a cache.
1003 * To help debugging, a zero exit status indicates all slabs were released.
1004 *
1005 * Return: %0 if all slabs were released, non-zero otherwise
1006 */
1007 int kmem_cache_shrink(struct kmem_cache *cachep)
1008 {
1009 int ret;
1010
1011 get_online_cpus();
1012 get_online_mems();
1013 kasan_cache_shrink(cachep);
1014 ret = __kmem_cache_shrink(cachep);
1015 put_online_mems();
1016 put_online_cpus();
1017 return ret;
1018 }
1019 EXPORT_SYMBOL(kmem_cache_shrink);
1020
1021 /**
1022 * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
1023 * @s: The cache pointer
1024 */
1025 void kmem_cache_shrink_all(struct kmem_cache *s)
1026 {
1027 struct kmem_cache *c;
1028
1029 if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
1030 kmem_cache_shrink(s);
1031 return;
1032 }
1033
1034 get_online_cpus();
1035 get_online_mems();
1036 kasan_cache_shrink(s);
1037 __kmem_cache_shrink(s);
1038
1039 /*
1040 * We have to take the slab_mutex to protect against concurrent
1041 * modification of the memcg list.
1042 */
1043 mutex_lock(&slab_mutex);
1044 for_each_memcg_cache(c, s) {
1045 /*
1046 * Don't need to shrink deactivated memcg caches.
1047 */
1048 if (c->flags & SLAB_DEACTIVATED)
1049 continue;
1050 kasan_cache_shrink(c);
1051 __kmem_cache_shrink(c);
1052 }
1053 mutex_unlock(&slab_mutex);
1054 put_online_mems();
1055 put_online_cpus();
1056 }
1057
1058 bool slab_is_available(void)
1059 {
1060 return slab_state >= UP;
1061 }
1062
1063 #ifndef CONFIG_SLOB
1064 /* Create a cache during boot when no slab services are available yet */
1065 void __init create_boot_cache(struct kmem_cache *s, const char *name,
1066 unsigned int size, slab_flags_t flags,
1067 unsigned int useroffset, unsigned int usersize)
1068 {
1069 int err;
1070 unsigned int align = ARCH_KMALLOC_MINALIGN;
1071
1072 s->name = name;
1073 s->size = s->object_size = size;
1074
1075 /*
1076 * For power of two sizes, guarantee natural alignment for kmalloc
1077 * caches, regardless of SL*B debugging options.
1078 */
1079 if (is_power_of_2(size))
1080 align = max(align, size);
1081 s->align = calculate_alignment(flags, align, size);
1082
1083 s->useroffset = useroffset;
1084 s->usersize = usersize;
1085
1086 slab_init_memcg_params(s);
1087
1088 err = __kmem_cache_create(s, flags);
1089
1090 if (err)
1091 panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
1092 name, size, err);
1093
1094 s->refcount = -1; /* Exempt from merging for now */
1095 }
1096
1097 struct kmem_cache *__init create_kmalloc_cache(const char *name,
1098 unsigned int size, slab_flags_t flags,
1099 unsigned int useroffset, unsigned int usersize)
1100 {
1101 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1102
1103 if (!s)
1104 panic("Out of memory when creating slab %s\n", name);
1105
1106 create_boot_cache(s, name, size, flags, useroffset, usersize);
1107 list_add(&s->list, &slab_caches);
1108 memcg_link_cache(s, NULL);
1109 s->refcount = 1;
1110 return s;
1111 }
1112
1113 struct kmem_cache *
1114 kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
1115 { /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
1116 EXPORT_SYMBOL(kmalloc_caches);
1117
1118 /*
1119 * Conversion table from small slab sizes / 8 to the index in the
1120 * kmalloc array. This is necessary for slabs < 192 since we have non power
1121 * of two cache sizes there. The size of larger slabs can be determined using
1122 * fls.
1123 */
1124 static u8 size_index[24] __ro_after_init = {
1125 3, /* 8 */
1126 4, /* 16 */
1127 5, /* 24 */
1128 5, /* 32 */
1129 6, /* 40 */
1130 6, /* 48 */
1131 6, /* 56 */
1132 6, /* 64 */
1133 1, /* 72 */
1134 1, /* 80 */
1135 1, /* 88 */
1136 1, /* 96 */
1137 7, /* 104 */
1138 7, /* 112 */
1139 7, /* 120 */
1140 7, /* 128 */
1141 2, /* 136 */
1142 2, /* 144 */
1143 2, /* 152 */
1144 2, /* 160 */
1145 2, /* 168 */
1146 2, /* 176 */
1147 2, /* 184 */
1148 2 /* 192 */
1149 };
1150
1151 static inline unsigned int size_index_elem(unsigned int bytes)
1152 {
1153 return (bytes - 1) / 8;
1154 }
1155
1156 /*
1157 * Find the kmem_cache structure that serves a given size of
1158 * allocation
1159 */
1160 struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
1161 {
1162 unsigned int index;
1163
1164 if (size <= 192) {
1165 if (!size)
1166 return ZERO_SIZE_PTR;
1167
1168 index = size_index[size_index_elem(size)];
1169 } else {
1170 if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))
1171 return NULL;
1172 index = fls(size - 1);
1173 }
1174
1175 return kmalloc_caches[kmalloc_type(flags)][index];
1176 }
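/*
 * Editor's worked example: kmalloc(100, GFP_KERNEL) has size <= 192, so
 * index = size_index[(100 - 1) / 8] = size_index[12] = 7, i.e. the
 * kmalloc-128 cache; kmalloc(300, GFP_KERNEL) is above 192, so
 * index = fls(300 - 1) = 9, i.e. the kmalloc-512 cache.
 */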
1177
1178 #ifdef CONFIG_ZONE_DMA
1179 #define INIT_KMALLOC_INFO(__size, __short_size) \
1180 { \
1181 .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
1182 .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
1183 .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \
1184 .size = __size, \
1185 }
1186 #else
1187 #define INIT_KMALLOC_INFO(__size, __short_size) \
1188 { \
1189 .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
1190 .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
1191 .size = __size, \
1192 }
1193 #endif
1194
1195 /*
1196 * kmalloc_info[] exists to make the slub_debug=,kmalloc-xx option work at boot time.
1197 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
1198 * kmalloc-67108864.
1199 */
1200 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
1201 INIT_KMALLOC_INFO(0, 0),
1202 INIT_KMALLOC_INFO(96, 96),
1203 INIT_KMALLOC_INFO(192, 192),
1204 INIT_KMALLOC_INFO(8, 8),
1205 INIT_KMALLOC_INFO(16, 16),
1206 INIT_KMALLOC_INFO(32, 32),
1207 INIT_KMALLOC_INFO(64, 64),
1208 INIT_KMALLOC_INFO(128, 128),
1209 INIT_KMALLOC_INFO(256, 256),
1210 INIT_KMALLOC_INFO(512, 512),
1211 INIT_KMALLOC_INFO(1024, 1k),
1212 INIT_KMALLOC_INFO(2048, 2k),
1213 INIT_KMALLOC_INFO(4096, 4k),
1214 INIT_KMALLOC_INFO(8192, 8k),
1215 INIT_KMALLOC_INFO(16384, 16k),
1216 INIT_KMALLOC_INFO(32768, 32k),
1217 INIT_KMALLOC_INFO(65536, 64k),
1218 INIT_KMALLOC_INFO(131072, 128k),
1219 INIT_KMALLOC_INFO(262144, 256k),
1220 INIT_KMALLOC_INFO(524288, 512k),
1221 INIT_KMALLOC_INFO(1048576, 1M),
1222 INIT_KMALLOC_INFO(2097152, 2M),
1223 INIT_KMALLOC_INFO(4194304, 4M),
1224 INIT_KMALLOC_INFO(8388608, 8M),
1225 INIT_KMALLOC_INFO(16777216, 16M),
1226 INIT_KMALLOC_INFO(33554432, 32M),
1227 INIT_KMALLOC_INFO(67108864, 64M)
1228 };
1229
1230 /*
1231 * Patch up the size_index table if we have strange large alignment
1232 * requirements for the kmalloc array. This is only the case for
1233 * MIPS, it seems. The standard arches will not generate any code here.
1234 *
1235 * Largest permitted alignment is 256 bytes due to the way we
1236 * handle the index determination for the smaller caches.
1237 *
1238 * Make sure that nothing crazy happens if someone starts tinkering
1239 * around with ARCH_KMALLOC_MINALIGN.
1240 */
1241 void __init setup_kmalloc_cache_index_table(void)
1242 {
1243 unsigned int i;
1244
1245 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
1246 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
1247
1248 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
1249 unsigned int elem = size_index_elem(i);
1250
1251 if (elem >= ARRAY_SIZE(size_index))
1252 break;
1253 size_index[elem] = KMALLOC_SHIFT_LOW;
1254 }
1255
1256 if (KMALLOC_MIN_SIZE >= 64) {
1257 /*
1258 * The 96 byte sized cache is not used if the alignment
1259 * is 64 bytes.
1260 */
1261 for (i = 64 + 8; i <= 96; i += 8)
1262 size_index[size_index_elem(i)] = 7;
1263
1264 }
1265
1266 if (KMALLOC_MIN_SIZE >= 128) {
1267 /*
1268 * The 192 byte sized cache is not used if the alignment
1269 * is 128 bytes. Redirect kmalloc to use the 256 byte cache
1270 * instead.
1271 */
1272 for (i = 128 + 8; i <= 192; i += 8)
1273 size_index[size_index_elem(i)] = 8;
1274 }
1275 }
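/*
 * Editor's worked example: with KMALLOC_MIN_SIZE == 128 the kmalloc-96 and
 * kmalloc-192 caches do not exist, so after this patching requests of up
 * to 128 bytes map to kmalloc-128 (KMALLOC_SHIFT_LOW) and requests of
 * 129..192 bytes are redirected to index 8, i.e. kmalloc-256.
 */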
1276
1277 static void __init
1278 new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
1279 {
1280 if (type == KMALLOC_RECLAIM)
1281 flags |= SLAB_RECLAIM_ACCOUNT;
1282
1283 kmalloc_caches[type][idx] = create_kmalloc_cache(
1284 kmalloc_info[idx].name[type],
1285 kmalloc_info[idx].size, flags, 0,
1286 kmalloc_info[idx].size);
1287 }
1288
1289 /*
1290 * Create the kmalloc array. Some of the regular kmalloc arrays
1291 * may already have been created because they were needed to
1292 * enable allocations for slab creation.
1293 */
1294 void __init create_kmalloc_caches(slab_flags_t flags)
1295 {
1296 int i;
1297 enum kmalloc_cache_type type;
1298
1299 for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
1300 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
1301 if (!kmalloc_caches[type][i])
1302 new_kmalloc_cache(i, type, flags);
1303
1304 /*
1305 * Caches that are not of a power-of-two size. These
1306 * have to be created immediately after the earlier
1307 * power-of-two caches.
1308 */
1309 if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
1310 !kmalloc_caches[type][1])
1311 new_kmalloc_cache(1, type, flags);
1312 if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
1313 !kmalloc_caches[type][2])
1314 new_kmalloc_cache(2, type, flags);
1315 }
1316 }
1317
1318 /* Kmalloc array is now usable */
1319 slab_state = UP;
1320
1321 #ifdef CONFIG_ZONE_DMA
1322 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
1323 struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
1324
1325 if (s) {
1326 kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
1327 kmalloc_info[i].name[KMALLOC_DMA],
1328 kmalloc_info[i].size,
1329 SLAB_CACHE_DMA | flags, 0,
1330 kmalloc_info[i].size);
1331 }
1332 }
1333 #endif
1334 }
1335 #endif /* !CONFIG_SLOB */
1336
1337 gfp_t kmalloc_fix_flags(gfp_t flags)
1338 {
1339 gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1340
1341 flags &= ~GFP_SLAB_BUG_MASK;
1342 pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1343 invalid_mask, &invalid_mask, flags, &flags);
1344 dump_stack();
1345
1346 return flags;
1347 }
1348
1349 /*
1350 * To avoid unnecessary overhead, we pass through large allocation requests
1351 * directly to the page allocator. We use __GFP_COMP, because we will need to
1352 * know the allocation order to free the pages properly in kfree.
1353 */
1354 void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
1355 {
1356 void *ret = NULL;
1357 struct page *page;
1358
1359 if (unlikely(flags & GFP_SLAB_BUG_MASK))
1360 flags = kmalloc_fix_flags(flags);
1361
1362 flags |= __GFP_COMP;
1363 page = alloc_pages(flags, order);
1364 if (likely(page)) {
1365 ret = page_address(page);
1366 mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
1367 PAGE_SIZE << order);
1368 }
1369 ret = kasan_kmalloc_large(ret, size, flags);
1370 /* As ret might get tagged, call kmemleak hook after KASAN. */
1371 kmemleak_alloc(ret, size, 1, flags);
1372 return ret;
1373 }
1374 EXPORT_SYMBOL(kmalloc_order);
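/*
 * Editor's worked example (assuming SLUB with 4K pages, where
 * KMALLOC_MAX_CACHE_SIZE is two pages): a 70000-byte kmalloc() falls
 * through to this path with order = get_order(70000) = 5, i.e. 32
 * contiguous pages (131072 bytes), accounted as NR_SLAB_UNRECLAIMABLE_B
 * and freed in one go by kfree() thanks to __GFP_COMP.
 */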
1375
1376 #ifdef CONFIG_TRACING
1377 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1378 {
1379 void *ret = kmalloc_order(size, flags, order);
1380 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1381 return ret;
1382 }
1383 EXPORT_SYMBOL(kmalloc_order_trace);
1384 #endif
1385
1386 #ifdef CONFIG_SLAB_FREELIST_RANDOM
1387 /* Randomize a generic freelist */
1388 static void freelist_randomize(struct rnd_state *state, unsigned int *list,
1389 unsigned int count)
1390 {
1391 unsigned int rand;
1392 unsigned int i;
1393
1394 for (i = 0; i < count; i++)
1395 list[i] = i;
1396
1397 /* Fisher-Yates shuffle */
1398 for (i = count - 1; i > 0; i--) {
1399 rand = prandom_u32_state(state);
1400 rand %= (i + 1);
1401 swap(list[i], list[rand]);
1402 }
1403 }
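/*
 * Editor's note: this is a standard in-place Fisher-Yates shuffle of the
 * identity permutation 0..count-1. For count == 4 it draws an index from
 * {0..3}, {0..2} and {0..1} in turn and swaps, so all 4! = 24 orderings
 * are reachable with (nearly) equal probability.
 */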
1404
1405 /* Create a random sequence per cache */
1406 int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
1407 gfp_t gfp)
1408 {
1409 struct rnd_state state;
1410
1411 if (count < 2 || cachep->random_seq)
1412 return 0;
1413
1414 cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
1415 if (!cachep->random_seq)
1416 return -ENOMEM;
1417
1418 /* Get best entropy at this stage of boot */
1419 prandom_seed_state(&state, get_random_long());
1420
1421 freelist_randomize(&state, cachep->random_seq, count);
1422 return 0;
1423 }
1424
1425 /* Destroy the per-cache random freelist sequence */
1426 void cache_random_seq_destroy(struct kmem_cache *cachep)
1427 {
1428 kfree(cachep->random_seq);
1429 cachep->random_seq = NULL;
1430 }
1431 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
1432
1433 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
1434 #ifdef CONFIG_SLAB
1435 #define SLABINFO_RIGHTS (0600)
1436 #else
1437 #define SLABINFO_RIGHTS (0400)
1438 #endif
1439
1440 static void print_slabinfo_header(struct seq_file *m)
1441 {
1442 /*
1443 * Output format version, so at least we can change it
1444 * without _too_ many complaints.
1445 */
1446 #ifdef CONFIG_DEBUG_SLAB
1447 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
1448 #else
1449 seq_puts(m, "slabinfo - version: 2.1\n");
1450 #endif
1451 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
1452 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
1453 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
1454 #ifdef CONFIG_DEBUG_SLAB
1455 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
1456 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
1457 #endif
1458 seq_putc(m, '\n');
1459 }
1460
1461 void *slab_start(struct seq_file *m, loff_t *pos)
1462 {
1463 mutex_lock(&slab_mutex);
1464 return seq_list_start(&slab_root_caches, *pos);
1465 }
1466
1467 void *slab_next(struct seq_file *m, void *p, loff_t *pos)
1468 {
1469 return seq_list_next(p, &slab_root_caches, pos);
1470 }
1471
1472 void slab_stop(struct seq_file *m, void *p)
1473 {
1474 mutex_unlock(&slab_mutex);
1475 }
1476
1477 static void
1478 memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
1479 {
1480 struct kmem_cache *c;
1481 struct slabinfo sinfo;
1482
1483 if (!is_root_cache(s))
1484 return;
1485
1486 for_each_memcg_cache(c, s) {
1487 memset(&sinfo, 0, sizeof(sinfo));
1488 get_slabinfo(c, &sinfo);
1489
1490 info->active_slabs += sinfo.active_slabs;
1491 info->num_slabs += sinfo.num_slabs;
1492 info->shared_avail += sinfo.shared_avail;
1493 info->active_objs += sinfo.active_objs;
1494 info->num_objs += sinfo.num_objs;
1495 }
1496 }
1497
1498 static void cache_show(struct kmem_cache *s, struct seq_file *m)
1499 {
1500 struct slabinfo sinfo;
1501
1502 memset(&sinfo, 0, sizeof(sinfo));
1503 get_slabinfo(s, &sinfo);
1504
1505 memcg_accumulate_slabinfo(s, &sinfo);
1506
1507 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
1508 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
1509 sinfo.objects_per_slab, (1 << sinfo.cache_order));
1510
1511 seq_printf(m, " : tunables %4u %4u %4u",
1512 sinfo.limit, sinfo.batchcount, sinfo.shared);
1513 seq_printf(m, " : slabdata %6lu %6lu %6lu",
1514 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
1515 slabinfo_show_stats(m, s);
1516 seq_putc(m, '\n');
1517 }
1518
1519 static int slab_show(struct seq_file *m, void *p)
1520 {
1521 struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
1522
1523 if (p == slab_root_caches.next)
1524 print_slabinfo_header(m);
1525 cache_show(s, m);
1526 return 0;
1527 }
1528
1529 void dump_unreclaimable_slab(void)
1530 {
1531 struct kmem_cache *s, *s2;
1532 struct slabinfo sinfo;
1533
1534 /*
1535 * Acquiring slab_mutex here is risky since we don't want to
1536 * sleep in the OOM path, but traversing the list without
1537 * holding the mutex risks a crash.
1538 * Use mutex_trylock to protect the traversal and dump nothing
1539 * if the mutex cannot be acquired.
1540 */
1541 if (!mutex_trylock(&slab_mutex)) {
1542 pr_warn("excessive unreclaimable slab but cannot dump stats\n");
1543 return;
1544 }
1545
1546 pr_info("Unreclaimable slab info:\n");
1547 pr_info("Name Used Total\n");
1548
1549 list_for_each_entry_safe(s, s2, &slab_caches, list) {
1550 if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
1551 continue;
1552
1553 get_slabinfo(s, &sinfo);
1554
1555 if (sinfo.num_objs > 0)
1556 pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
1557 (sinfo.active_objs * s->size) / 1024,
1558 (sinfo.num_objs * s->size) / 1024);
1559 }
1560 mutex_unlock(&slab_mutex);
1561 }
1562
1563 #if defined(CONFIG_MEMCG_KMEM)
1564 void *memcg_slab_start(struct seq_file *m, loff_t *pos)
1565 {
1566 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1567
1568 mutex_lock(&slab_mutex);
1569 return seq_list_start(&memcg->kmem_caches, *pos);
1570 }
1571
1572 void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
1573 {
1574 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1575
1576 return seq_list_next(p, &memcg->kmem_caches, pos);
1577 }
1578
1579 void memcg_slab_stop(struct seq_file *m, void *p)
1580 {
1581 mutex_unlock(&slab_mutex);
1582 }
1583
1584 int memcg_slab_show(struct seq_file *m, void *p)
1585 {
1586 struct kmem_cache *s = list_entry(p, struct kmem_cache,
1587 memcg_params.kmem_caches_node);
1588 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1589
1590 if (p == memcg->kmem_caches.next)
1591 print_slabinfo_header(m);
1592 cache_show(s, m);
1593 return 0;
1594 }
1595 #endif
1596
1597 /*
1598 * slabinfo_op - iterator that generates /proc/slabinfo
1599 *
1600 * Output layout:
1601 * cache-name
1602 * num-active-objs
1603 * total-objs
1604 * object size
1605 * num-active-slabs
1606 * total-slabs
1607 * num-pages-per-slab
1608 * + further values on SMP and with statistics enabled
1609 */
1610 static const struct seq_operations slabinfo_op = {
1611 .start = slab_start,
1612 .next = slab_next,
1613 .stop = slab_stop,
1614 .show = slab_show,
1615 };
1616
1617 static int slabinfo_open(struct inode *inode, struct file *file)
1618 {
1619 return seq_open(file, &slabinfo_op);
1620 }
1621
1622 static const struct proc_ops slabinfo_proc_ops = {
1623 .proc_flags = PROC_ENTRY_PERMANENT,
1624 .proc_open = slabinfo_open,
1625 .proc_read = seq_read,
1626 .proc_write = slabinfo_write,
1627 .proc_lseek = seq_lseek,
1628 .proc_release = seq_release,
1629 };
1630
1631 static int __init slab_proc_init(void)
1632 {
1633 proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
1634 return 0;
1635 }
1636 module_init(slab_proc_init);
1637
1638 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
1639 /*
1640 * Display information about kmem caches that have child memcg caches.
1641 */
1642 static int memcg_slabinfo_show(struct seq_file *m, void *unused)
1643 {
1644 struct kmem_cache *s, *c;
1645 struct slabinfo sinfo;
1646
1647 mutex_lock(&slab_mutex);
1648 seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
1649 seq_puts(m, " <active_slabs> <num_slabs>\n");
1650 list_for_each_entry(s, &slab_root_caches, root_caches_node) {
1651 /*
1652 * Skip kmem caches that don't have any memcg children.
1653 */
1654 if (list_empty(&s->memcg_params.children))
1655 continue;
1656
1657 memset(&sinfo, 0, sizeof(sinfo));
1658 get_slabinfo(s, &sinfo);
1659 seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n",
1660 cache_name(s), sinfo.active_objs, sinfo.num_objs,
1661 sinfo.active_slabs, sinfo.num_slabs);
1662
1663 for_each_memcg_cache(c, s) {
1664 struct cgroup_subsys_state *css;
1665 char *status = "";
1666
1667 css = &c->memcg_params.memcg->css;
1668 if (!(css->flags & CSS_ONLINE))
1669 status = ":dead";
1670 else if (c->flags & SLAB_DEACTIVATED)
1671 status = ":deact";
1672
1673 memset(&sinfo, 0, sizeof(sinfo));
1674 get_slabinfo(c, &sinfo);
1675 seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
1676 cache_name(c), css->id, status,
1677 sinfo.active_objs, sinfo.num_objs,
1678 sinfo.active_slabs, sinfo.num_slabs);
1679 }
1680 }
1681 mutex_unlock(&slab_mutex);
1682 return 0;
1683 }
1684 DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
1685
1686 static int __init memcg_slabinfo_init(void)
1687 {
1688 debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO,
1689 NULL, NULL, &memcg_slabinfo_fops);
1690 return 0;
1691 }
1692
1693 late_initcall(memcg_slabinfo_init);
1694 #endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
1695 #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
1696
1697 static __always_inline void *__do_krealloc(const void *p, size_t new_size,
1698 gfp_t flags)
1699 {
1700 void *ret;
1701 size_t ks;
1702
1703 ks = ksize(p);
1704
1705 if (ks >= new_size) {
1706 p = kasan_krealloc((void *)p, new_size, flags);
1707 return (void *)p;
1708 }
1709
1710 ret = kmalloc_track_caller(new_size, flags);
1711 if (ret && p)
1712 memcpy(ret, p, ks);
1713
1714 return ret;
1715 }
1716
1717 /**
1718 * krealloc - reallocate memory. The contents will remain unchanged.
1719 * @p: object to reallocate memory for.
1720 * @new_size: how many bytes of memory are required.
1721 * @flags: the type of memory to allocate.
1722 *
1723 * The contents of the object pointed to are preserved up to the
1724 * lesser of the new and old sizes. If @p is %NULL, krealloc()
1725 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
1726 * %NULL pointer, the object pointed to is freed.
1727 *
1728 * Return: pointer to the allocated memory or %NULL in case of error
1729 */
1730 void *krealloc(const void *p, size_t new_size, gfp_t flags)
1731 {
1732 void *ret;
1733
1734 if (unlikely(!new_size)) {
1735 kfree(p);
1736 return ZERO_SIZE_PTR;
1737 }
1738
1739 ret = __do_krealloc(p, new_size, flags);
1740 if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
1741 kfree(p);
1742
1743 return ret;
1744 }
1745 EXPORT_SYMBOL(krealloc);
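/*
 * Editor's usage sketch (hypothetical "buf", "new" and "n"): check the
 * return value before overwriting the old pointer, since krealloc()
 * leaves the original buffer untouched on failure.
 *
 *	new = krealloc(buf, n * sizeof(*buf), GFP_KERNEL);
 *	if (!new)
 *		return -ENOMEM;		// "buf" is still valid here
 *	buf = new;
 */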
1746
1747 /**
1748 * kfree_sensitive - Clear sensitive information in memory before freeing
1749 * @p: object to free memory of
1750 *
1751 * The memory of the object @p points to is zeroed before it is freed.
1752 * If @p is %NULL, kfree_sensitive() does nothing.
1753 *
1754 * Note: this function zeroes the whole allocated buffer which can be a good
1755 * deal bigger than the requested buffer size passed to kmalloc(). So be
1756 * careful when using this function in performance sensitive code.
1757 */
1758 void kfree_sensitive(const void *p)
1759 {
1760 size_t ks;
1761 void *mem = (void *)p;
1762
1763 ks = ksize(mem);
1764 if (ks)
1765 memzero_explicit(mem, ks);
1766 kfree(mem);
1767 }
1768 EXPORT_SYMBOL(kfree_sensitive);
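/*
 * Editor's usage sketch (hypothetical "key" buffer): typical for key
 * material or other secrets that must not linger in freed memory.
 *
 *	key = kmalloc(key_len, GFP_KERNEL);
 *	...
 *	kfree_sensitive(key);	// zeroes ksize(key) bytes, then frees
 */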
1769
1770 /**
1771 * ksize - get the actual amount of memory allocated for a given object
1772 * @objp: Pointer to the object
1773 *
1774 * kmalloc may internally round up allocations and return more memory
1775 * than requested. ksize() can be used to determine the actual amount of
1776 * memory allocated. The caller may use this additional memory, even though
1777 * a smaller amount of memory was initially specified with the kmalloc call.
1778 * The caller must guarantee that objp points to a valid object previously
1779 * allocated with either kmalloc() or kmem_cache_alloc(). The object
1780 * must not be freed during the duration of the call.
1781 *
1782 * Return: size of the actual memory used by @objp in bytes
1783 */
1784 size_t ksize(const void *objp)
1785 {
1786 size_t size;
1787
1788 /*
1789 * We need to check that the pointed to object is valid, and only then
1790 * unpoison the shadow memory below. We use __kasan_check_read() to
1791 * generate a more useful report at the time ksize() is called (rather
1792 * than later where behaviour is undefined due to potential
1793 * use-after-free or double-free).
1794 *
1795 * If the pointed to memory is invalid we return 0, to avoid users of
1796 * ksize() writing to and potentially corrupting the memory region.
1797 *
1798 * We want to perform the check before __ksize(), to avoid potentially
1799 * crashing in __ksize() due to accessing invalid metadata.
1800 */
1801 if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
1802 return 0;
1803
1804 size = __ksize(objp);
1805 /*
1806 * We assume that ksize callers could use the whole allocated area,
1807 * so we need to unpoison this area.
1808 */
1809 kasan_unpoison_shadow(objp, size);
1810 return size;
1811 }
1812 EXPORT_SYMBOL(ksize);
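/*
 * Editor's worked example: kmalloc(100, GFP_KERNEL) is served from the
 * kmalloc-128 cache, so ksize() on the returned pointer reports 128 and
 * the caller may legitimately use all 128 bytes.
 */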
1813
1814 /* Tracepoints definitions. */
1815 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
1816 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
1817 EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
1818 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
1819 EXPORT_TRACEPOINT_SYMBOL(kfree);
1820 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1821
1822 int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
1823 {
1824 if (__should_failslab(s, gfpflags))
1825 return -ENOMEM;
1826 return 0;
1827 }
1828 ALLOW_ERROR_INJECTION(should_failslab, ERRNO);