// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack depot - a stack trace storage that avoids duplication.
 *
 * Internally, stack depot maintains a hash table of unique stacktraces. The
 * stack traces themselves are stored contiguously one after another in a set
 * of separate page allocations.
 *
 * Author: Alexander Potapenko <glider@google.com>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on the code by Dmitry Chernenkov.
 */
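/*
 * Typical round trip (an illustrative sketch, not a definition from this
 * file): capture a trace with stack_trace_save(), deposit it, and fetch it
 * back by handle only when reporting.
 *
 *	unsigned long stack[64], *frames;
 *	unsigned int nr;
 *	depot_stack_handle_t handle;
 *
 *	nr = stack_trace_save(stack, ARRAY_SIZE(stack), 0);
 *	handle = stack_depot_save(stack, nr, GFP_KERNEL);
 *	...
 *	nr = stack_depot_fetch(handle, &frames);
 *	stack_trace_print(frames, nr, 0);
 */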
#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/kmsan.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>
#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_VALID_BITS 1
#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
#define DEPOT_STACK_ALIGN 4
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \
			       DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS)
#define DEPOT_POOLS_CAP 8192
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
/* Compact structure that stores a reference to a stack. */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		u32 offset	: DEPOT_OFFSET_BITS;
		u32 valid	: DEPOT_VALID_BITS;
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};
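/*
 * Worked example (illustrative): assuming 32-bit handles, 4 KB pages
 * (PAGE_SHIFT == 12), and STACK_DEPOT_EXTRA_BITS == 5 (its value in
 * <linux/stackdepot.h> at the time of writing), the fields work out to:
 *
 *	DEPOT_OFFSET_BITS     = 2 + 12 - 4         = 10
 *	DEPOT_POOL_INDEX_BITS = 32 - 1 - 10 - 5    = 16
 *	DEPOT_MAX_POOLS       = min(1 << 16, 8192) = 8192
 *
 * The offset is counted in 16-byte units (1 << DEPOT_STACK_ALIGN), so 10
 * bits are exactly enough to address the whole 16 KB pool.
 */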
struct stack_record {
	struct stack_record *next;	/* Link in the hash table */
	u32 hash;			/* Hash in the hash table */
	u32 size;			/* Number of stored frames */
	union handle_parts handle;
	unsigned long entries[];	/* Variable-sized array of frames */
};
static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata =
	IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;
/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c
/* Hash table of pointers to stored stack traces. */
static struct stack_record **stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack traces. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Currently used pool in stack_pools. */
static int pool_index;
/* Offset to the unused space in the currently used pool. */
static size_t pool_offset;
/* Lock that protects the variables above. */
static DEFINE_RAW_SPINLOCK(pool_lock);
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool.
 * This flag marks that this next extra pool needs to be allocated and
 * initialized. It has the value 0 when either the next pool is already
 * initialized or the limit on the number of pools is reached.
 */
static int next_pool_required = 1;
static int __init disable_stack_depot(char *str)
{
	int ret;

	ret = kstrtobool(str, &stack_depot_disabled);
	if (!ret && stack_depot_disabled) {
		pr_info("disabled\n");
		stack_table = NULL;
	}
	return 0;
}
early_param("stack_depot_disable", disable_stack_depot);
void __init stack_depot_request_early_init(void)
{
	/* Too late to request early init now. */
	WARN_ON(__stack_depot_early_init_passed);

	__stack_depot_early_init_requested = true;
}
/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
	unsigned long entries = 0;

	/* This function must be called only once, from mm_init(). */
	if (WARN_ON(__stack_depot_early_init_passed))
		return 0;
	__stack_depot_early_init_passed = true;

	/*
	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
	 * in fuzzing scenarios, which leads to a large number of different
	 * stack traces being stored in stack depot.
	 */
	if (kasan_enabled() && !stack_bucket_number_order)
		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

	if (!__stack_depot_early_init_requested || stack_depot_disabled)
		return 0;

	/*
	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
	 * on the automatic calculations performed by alloc_large_system_hash.
	 */
	if (stack_bucket_number_order)
		entries = 1UL << stack_bucket_number_order;
	pr_info("allocating hash table via alloc_large_system_hash\n");
	stack_table = alloc_large_system_hash("stackdepot",
						sizeof(struct stack_record *),
						entries,
						STACK_HASH_TABLE_SCALE,
						HASH_EARLY | HASH_ZERO,
						NULL,
						&stack_hash_mask,
						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		return -ENOMEM;
	}

	return 0;
}
/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
	static DEFINE_MUTEX(stack_depot_init_mutex);
	unsigned long entries;
	int ret = 0;

	mutex_lock(&stack_depot_init_mutex);

	if (stack_depot_disabled || stack_table)
		goto out_unlock;

	/*
	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
	 * if assigned, and rely on automatic scaling otherwise.
	 */
	if (stack_bucket_number_order) {
		entries = 1UL << stack_bucket_number_order;
	} else {
		int scale = STACK_HASH_TABLE_SCALE;

		entries = nr_free_buffer_pages();
		entries = roundup_pow_of_two(entries);

		if (scale > PAGE_SHIFT)
			entries >>= (scale - PAGE_SHIFT);
		else
			entries <<= (PAGE_SHIFT - scale);
	}

	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
	stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		ret = -ENOMEM;
		goto out_unlock;
	}
	stack_hash_mask = entries - 1;

out_unlock:
	mutex_unlock(&stack_depot_init_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);
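/*
 * Example (illustrative, with a hypothetical module name): a user that only
 * needs stack depot after boot can initialize it lazily from its own init
 * path; the mutex above makes repeated and concurrent calls safe.
 *
 *	static int __init my_tracker_init(void)
 *	{
 *		int ret = stack_depot_init();
 *
 *		if (ret)
 *			return ret;
 *		...
 *	}
 */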
/* Uses preallocated memory to initialize a new stack depot pool. */
static void depot_init_pool(void **prealloc)
{
	/*
	 * If the next pool is already initialized or the maximum number of
	 * pools is reached, do not use the preallocated memory.
	 * smp_load_acquire() here pairs with smp_store_release() below and
	 * in depot_alloc_stack().
	 */
	if (!smp_load_acquire(&next_pool_required))
		return;

	/* Check if the current pool is not yet allocated. */
	if (stack_pools[pool_index] == NULL) {
		/* Use the preallocated memory for the current pool. */
		stack_pools[pool_index] = *prealloc;
		*prealloc = NULL;
	} else {
		/*
		 * Otherwise, use the preallocated memory for the next pool
		 * as long as we do not exceed the maximum number of pools.
		 */
		if (pool_index + 1 < DEPOT_MAX_POOLS) {
			stack_pools[pool_index + 1] = *prealloc;
			*prealloc = NULL;
		}
	}

	/*
	 * At this point, either the next pool is initialized or the
	 * maximum number of pools is reached. In either case, take
	 * note that initializing another pool is not required.
	 * This smp_store_release pairs with smp_load_acquire() above
	 * and in stack_depot_save().
	 */
	smp_store_release(&next_pool_required, 0);
}
/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
	struct stack_record *stack;
	size_t required_size = struct_size(stack, entries, size);

	required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN);

	/* Check if there is not enough space in the current pool. */
	if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) {
		/* Bail out if we reached the pool limit. */
		if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) {
			WARN_ONCE(1, "Stack depot reached limit capacity");
			return NULL;
		}

		/*
		 * Move on to the next pool.
		 * WRITE_ONCE pairs with potential concurrent read in
		 * stack_depot_fetch().
		 */
		WRITE_ONCE(pool_index, pool_index + 1);
		pool_offset = 0;
		/*
		 * If the maximum number of pools is not reached, take note
		 * that the next pool needs to be initialized.
		 * smp_store_release() here pairs with smp_load_acquire() in
		 * stack_depot_save() and depot_init_pool().
		 */
		if (pool_index + 1 < DEPOT_MAX_POOLS)
			smp_store_release(&next_pool_required, 1);
	}

	/* Assign the preallocated memory to a pool if required. */
	if (*prealloc)
		depot_init_pool(prealloc);

	/* Check if we have a pool to save the stack trace. */
	if (stack_pools[pool_index] == NULL)
		return NULL;

	/* Save the stack trace. */
	stack = stack_pools[pool_index] + pool_offset;
	stack->hash = hash;
	stack->size = size;
	stack->handle.pool_index = pool_index;
	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
	stack->handle.valid = 1;
	stack->handle.extra = 0;
	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
	pool_offset += required_size;
	/*
	 * Let KMSAN know the stored stack record is initialized. This shall
	 * prevent false positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(stack, required_size);

	return stack;
}
/* Calculates the hash for a stack. */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
	return jhash2((u32 *)entries,
		      array_size(size, sizeof(*entries)) / sizeof(u32),
		      STACK_HASH_SEED);
}
/*
 * Non-instrumented version of memcmp().
 * Does not check the lexicographical order, only the equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	for ( ; n-- ; u1++, u2++) {
		if (*u1 != *u2)
			return 1;
	}
	return 0;
}
/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct stack_record *bucket,
					      unsigned long *entries, int size,
					      u32 hash)
{
	struct stack_record *found;

	for (found = bucket; found; found = found->next) {
		if (found->hash == hash &&
		    found->size == size &&
		    !stackdepot_memcmp(entries, found->entries, size))
			return found;
	}
	return NULL;
}
depot_stack_handle_t __stack_depot_save(unsigned long *entries,
					unsigned int nr_entries,
					gfp_t alloc_flags, bool can_alloc)
{
	struct stack_record *found = NULL, **bucket;
	union handle_parts retval = { .handle = 0 };
	struct page *page = NULL;
	void *prealloc = NULL;
	unsigned long flags;
	u32 hash;

	/*
	 * If this stack trace is from an interrupt, including anything before
	 * interrupt entry usually leads to unbounded stack depot growth.
	 *
	 * Since use of filter_irq_stacks() is a requirement to ensure stack
	 * depot can efficiently deduplicate interrupt stacks, always call
	 * filter_irq_stacks() here to simplify all callers' use of stack depot.
	 */
	nr_entries = filter_irq_stacks(entries, nr_entries);

	if (unlikely(nr_entries == 0) || stack_depot_disabled)
		goto fast_exit;

	hash = hash_stack(entries, nr_entries);
	bucket = &stack_table[hash & stack_hash_mask];

	/*
	 * Fast path: look the stack trace up without locking.
	 * The smp_load_acquire() here pairs with smp_store_release() to
	 * |bucket| below.
	 */
	found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash);
	if (found)
		goto exit;

	/*
	 * Check if another stack pool needs to be initialized. If so, allocate
	 * the memory now - we won't be able to do that under the lock.
	 *
	 * The smp_load_acquire() here pairs with smp_store_release() to
	 * |next_pool_inited| in depot_alloc_stack() and depot_init_pool().
	 */
	if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) {
		/*
		 * Zero out zone modifiers, as we don't have specific zone
		 * requirements. Keep the flags related to allocation in atomic
		 * contexts and I/O.
		 */
		alloc_flags &= ~GFP_ZONEMASK;
		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
		alloc_flags |= __GFP_NOWARN;
		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
		if (page)
			prealloc = page_address(page);
	}

	raw_spin_lock_irqsave(&pool_lock, flags);

	found = find_stack(*bucket, entries, nr_entries, hash);
	if (!found) {
		struct stack_record *new =
			depot_alloc_stack(entries, nr_entries, hash, &prealloc);

		if (new) {
			new->next = *bucket;
			/*
			 * This smp_store_release() pairs with
			 * smp_load_acquire() from |bucket| above.
			 */
			smp_store_release(bucket, new);
			found = new;
		}
	} else if (prealloc) {
		/*
		 * Stack depot already contains this stack trace, but let's
		 * keep the preallocated memory for the future.
		 */
		depot_init_pool(&prealloc);
	}

	raw_spin_unlock_irqrestore(&pool_lock, flags);
exit:
	if (prealloc) {
		/* Stack depot didn't use this memory, free it. */
		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
	}
	if (found)
		retval.handle = found->handle.handle;
fast_exit:
	return retval.handle;
}
EXPORT_SYMBOL_GPL(__stack_depot_save);
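/*
 * Example (illustrative): a caller running in a context where taking pages
 * from the buddy allocator is unsafe can pass can_alloc == false, and must
 * then tolerate a zero handle when no pool space is left.
 *
 *	handle = __stack_depot_save(entries, nr, 0, false);
 *	if (!handle)
 *		...	// trace not stored this time; caller must cope
 */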
depot_stack_handle_t stack_depot_save(unsigned long *entries,
				      unsigned int nr_entries,
				      gfp_t alloc_flags)
{
	return __stack_depot_save(entries, nr_entries, alloc_flags, true);
}
EXPORT_SYMBOL_GPL(stack_depot_save);
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
			       unsigned long **entries)
{
	union handle_parts parts = { .handle = handle };
	/*
	 * READ_ONCE pairs with potential concurrent write in
	 * depot_alloc_stack().
	 */
	int pool_index_cached = READ_ONCE(pool_index);
	void *pool;
	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
	struct stack_record *stack;

	*entries = NULL;
	/*
	 * Let KMSAN know *entries is initialized. This shall prevent false
	 * positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(entries, sizeof(*entries));

	if (!handle)
		return 0;

	if (parts.pool_index > pool_index_cached) {
		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
			parts.pool_index, pool_index_cached, handle);
		return 0;
	}
	pool = stack_pools[parts.pool_index];
	if (!pool)
		return 0;
	stack = pool + offset;

	*entries = stack->entries;
	return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);
void stack_depot_print(depot_stack_handle_t stack)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(stack, &entries);
	if (nr_entries > 0)
		stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);
int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
			int spaces)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(handle, &entries);
	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
						spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);
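/*
 * Example (illustrative): render a saved trace into a fixed-size buffer,
 * e.g. for a single report line. The buffer is hypothetical.
 *
 *	char buf[512];
 *	int len = stack_depot_snprint(handle, buf, sizeof(buf), 2);
 */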
depot_stack_handle_t __must_check stack_depot_set_extra_bits(
			depot_stack_handle_t handle, unsigned int extra_bits)
{
	union handle_parts parts = { .handle = handle };

	/* Don't set extra bits on empty handles. */
	if (!handle)
		return 0;

	parts.extra = extra_bits;
	return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);
unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };

	return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
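/*
 * Example (illustrative): users such as KMSAN pack a few tag bits into the
 * otherwise unused part of a handle instead of enlarging it. A round trip,
 * where the second call yields the 0x3 stored by the first:
 *
 *	handle = stack_depot_set_extra_bits(handle, 0x3);
 *	...
 *	bits = stack_depot_get_extra_bits(handle);
 */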