/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * UCRL-CODE-235197
 *
 * This file is part of the SPL, Solaris Porting Layer.
 * For details, see <http://zfsonlinux.org/>.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <linux/mm.h>
#include <linux/ratelimit.h>

/*
 * As a general rule kmem_alloc() allocations should be small, preferably
 * just a few pages, since they must be physically contiguous.  Therefore,
 * a rate limited warning will be printed to the console for any
 * kmem_alloc() which exceeds a reasonable threshold.
 *
 * The default warning threshold is set to sixteen pages but capped at 64K
 * to accommodate systems using large pages.  This value was selected to be
 * small enough to ensure the largest allocations are quickly noticed and
 * fixed, but large enough to avoid logging any warnings when an allocation
 * size is larger than optimal but not a serious concern.  Since this value
 * is tunable, developers are encouraged to set it lower when testing so
 * any new largish allocations are quickly caught.  These warnings may be
 * disabled by setting the threshold to zero.
 */
/* BEGIN CSTYLED */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
	"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);
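
/*
 * Because the parameter is registered with mode 0644 above, it can be
 * tuned at runtime through sysfs.  An illustrative example (the exact
 * path assumes the module is loaded as "spl"):
 *
 *	echo 8192 > /sys/module/spl/parameters/spl_kmem_alloc_warn
 *
 * Writing zero disables the warnings entirely.
 */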

/*
 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
 * Allocations which are marginally smaller than this limit may succeed but
 * should still be avoided due to the expense of locating a contiguous range
 * of free pages.  Therefore, a maximum kmem size with a reasonable safety
 * margin of 4x is set.  kmem_alloc() allocations larger than this maximum
 * will quickly fail.  vmem_alloc() allocations less than or equal to this
 * value will use kmalloc(), but shift to vmalloc() when exceeding this
 * value.
 */
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
	"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);
/* END CSTYLED */
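
/*
 * An illustrative sketch of the resulting behavior, assuming the default
 * spl_kmem_alloc_max of KMALLOC_MAX_SIZE / 4 (commonly 1 MiB):
 *
 *	kmem_alloc(16384, KM_SLEEP);	// small: kmalloc() path
 *	kmem_alloc(16 << 20, KM_SLEEP);	// over the maximum: returns NULL
 *	vmem_alloc(16 << 20, KM_SLEEP);	// over the maximum: uses vmalloc()
 */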

int
kmem_debugging(void)
{
	return (0);
}
EXPORT_SYMBOL(kmem_debugging);

char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	va_list aq;
	char *ptr;

	do {
		va_copy(aq, ap);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
		va_end(aq);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_vasprintf);

char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *ptr;

	do {
		va_start(ap, fmt);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
		va_end(ap);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_asprintf);

static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	int n;

	n = strlen(str);
	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
	if (ptr)
		memcpy(ptr, str, n + 1);

	return (ptr);
}

char *
strdup(const char *str)
{
	return (__strdup(str, KM_SLEEP));
}
EXPORT_SYMBOL(strdup);

void
strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(strfree);
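
/*
 * Illustrative usage of the string helpers above (names hypothetical).
 * Because kmem_asprintf() retries with KM_SLEEP semantics until the
 * allocation succeeds, the result never needs a NULL check:
 *
 *	char *msg = kmem_asprintf("vdev %s: error %d", vd_name, err);
 *	...
 *	strfree(msg);
 */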

/*
 * Limit the number of large allocation stack traces dumped to no more than
 * 5 every 60 seconds to prevent denial-of-service attacks from debug code.
 */
DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5);

/*
 * General purpose unified implementation of kmem_alloc().  It is an
 * amalgamation of Linux and Illumos allocator design.  It should never be
 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 * relatively portable.  Consumers may only access this function through
 * wrappers that enforce the common flags to ensure portability.
 */
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
	gfp_t lflags = kmem_flags_convert(flags);
	int use_vmem = 0;
	void *ptr;

	/*
	 * Log abnormally large allocations and rate limit the console
	 * output.  Allocations larger than spl_kmem_alloc_warn should be
	 * performed through the vmem_alloc()/vmem_zalloc() interfaces.
	 */
	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
	    !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) {
		printk(KERN_WARNING
		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
		    "https://github.com/zfsonlinux/zfs/issues/new\n",
		    (unsigned long)size, flags);
		dump_stack();
	}

	/*
	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is
	 * used, unlike kmem_alloc() with KM_SLEEP on Illumos which never
	 * fails.
	 */
	do {
		/*
		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
		 * is unsafe.  This must fail for all kmem_alloc() and
		 * kmem_zalloc() callers.
		 *
		 * For vmem_alloc() and vmem_zalloc() callers it is
		 * permissible to use __vmalloc().  However, in general use
		 * of __vmalloc() is strongly discouraged because a global
		 * lock must be acquired.  Contention on this lock can
		 * significantly impact performance, so the virtual address
		 * space should not be manipulated frequently.
		 */
		if ((size > spl_kmem_alloc_max) || use_vmem) {
			if (flags & KM_VMEM) {
				ptr = __vmalloc(size, lflags, PAGE_KERNEL);
			} else {
				return (NULL);
			}
		} else {
			ptr = kmalloc_node(size, lflags, node);
		}

		if (likely(ptr) || (flags & KM_NOSLEEP))
			return (ptr);

		/*
		 * For vmem_alloc() and vmem_zalloc() callers retry
		 * immediately using __vmalloc() which is unlikely to fail.
		 */
		if ((flags & KM_VMEM) && (use_vmem == 0)) {
			use_vmem = 1;
			continue;
		}

		if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) {
			printk(KERN_WARNING
			    "Possible memory allocation deadlock: "
			    "size=%lu lflags=0x%x\n",
			    (unsigned long)size, lflags);
			dump_stack();
		}

		/*
		 * Use cond_resched() instead of congestion_wait() to avoid
		 * deadlocking systems where there are no block devices.
		 */
		cond_resched();
	} while (1);

	return (NULL);
}

inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}

/*
 * Memory allocation and accounting for kmem_* style allocations.  When
 * DEBUG_KMEM is enabled the total memory allocated will be tracked and
 * any memory leaked will be reported during module unload.
 *
 * ./configure --enable-debug-kmem
 */
#ifdef DEBUG_KMEM

/* Shim layer memory accounting */
#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
#else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
#endif /* HAVE_ATOMIC64_T */

EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);

inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
	void *ptr;

	ptr = spl_kmem_alloc_impl(size, flags, node);
	if (ptr) {
		kmem_alloc_used_add(size);
		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
			kmem_alloc_max = kmem_alloc_used_read();
	}

	return (ptr);
}

inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
	kmem_alloc_used_sub(size);
	spl_kmem_free_impl(ptr, size);
}

/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free.  When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console.  Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straightforward.
 *
 * Not surprisingly, with debugging enabled the xmem_locks are very highly
 * contended, particularly on xfree().  If we want to run with this
 * detailed debugging enabled for anything other than debugging we need to
 * minimize the contention by moving to a lock per xmem_table entry model.
 *
 * ./configure --enable-debug-kmem-tracking
 */
#ifdef DEBUG_KMEM_TRACKING

#include <linux/hash.h>
#include <linux/ctype.h>

#define	KMEM_HASH_BITS		10
#define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)

typedef struct kmem_debug {
	struct hlist_node kd_hlist;	/* Hash node linkage */
	struct list_head kd_list;	/* List of all allocations */
	void *kd_addr;			/* Allocation pointer */
	size_t kd_size;			/* Allocation size */
	const char *kd_func;		/* Allocation function */
	int kd_line;			/* Allocation line */
} kmem_debug_t;

static spinlock_t kmem_lock;
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
static struct list_head kmem_list;

static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
    int bits, const void *addr)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct kmem_debug *p;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	head = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, head) {
		p = list_entry(node, struct kmem_debug, kd_hlist);
		if (p->kd_addr == addr) {
			hlist_del_init(&p->kd_hlist);
			list_del_init(&p->kd_list);
			spin_unlock_irqrestore(lock, flags);
			return (p);
		}
	}

	spin_unlock_irqrestore(lock, flags);

	return (NULL);
}

inline void *
spl_kmem_alloc_track(size_t size, int flags,
    const char *func, int line, int node)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
	if (dptr == NULL)
		return (NULL);

	dptr->kd_func = __strdup(func, flags);
	if (dptr->kd_func == NULL) {
		kfree(dptr);
		return (NULL);
	}

	ptr = spl_kmem_alloc_debug(size, flags, node);
	if (ptr == NULL) {
		kfree(dptr->kd_func);
		kfree(dptr);
		return (NULL);
	}

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	return (ptr);
}

inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	/* Ignore NULL pointer since we haven't tracked it at all */
	if (ptr == NULL)
		return;

	/* Must exist in hash due to kmem_alloc() */
	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
	ASSERT3P(dptr, !=, NULL);
	ASSERT3S(dptr->kd_size, ==, size);

	kfree(dptr->kd_func);
	kfree(dptr);

	spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

/*
 * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
 */
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);

void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

	flags |= KM_ZERO;

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);

void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
	return (spl_kmem_free_impl(buf, size));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_free_debug(buf, size));
#else
	return (spl_kmem_free_track(buf, size));
#endif
}
EXPORT_SYMBOL(spl_kmem_free);
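
/*
 * Consumers reach the functions above through the kmem_alloc(),
 * kmem_zalloc() and kmem_free() macros in <sys/kmem.h>, which supply the
 * caller's __func__ and __LINE__ for the tracking builds.  A typical
 * (illustrative; my_type_t is hypothetical) pairing:
 *
 *	my_type_t *tp = kmem_zalloc(sizeof (my_type_t), KM_SLEEP);
 *	...
 *	kmem_free(tp, sizeof (my_type_t));
 *
 * Note that, matching the Illumos interface, the allocation size must be
 * passed back to kmem_free().
 */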

#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
	int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
	int i, flag = 1;

	ASSERT(str != NULL && len >= 17);
	memset(str, 0, len);

	/*
	 * Check for a fully printable string, and while we are at
	 * it place the printable characters in the passed buffer.
	 */
	for (i = 0; i < size; i++) {
		str[i] = ((char *)(kd->kd_addr))[i];
		if (isprint(str[i])) {
			continue;
		} else {
			/*
			 * Minimum number of printable characters found
			 * to make it worthwhile to print this as ascii.
			 */
			if (i > min)
				break;

			flag = 0;
			break;
		}
	}

	if (!flag) {
		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
		    *((uint8_t *)kd->kd_addr),
		    *((uint8_t *)kd->kd_addr + 2),
		    *((uint8_t *)kd->kd_addr + 4),
		    *((uint8_t *)kd->kd_addr + 6),
		    *((uint8_t *)kd->kd_addr + 8),
		    *((uint8_t *)kd->kd_addr + 10),
		    *((uint8_t *)kd->kd_addr + 12),
		    *((uint8_t *)kd->kd_addr + 14));
	}

	return (str);
}

static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
	int i;

	spin_lock_init(lock);
	INIT_LIST_HEAD(list);

	for (i = 0; i < size; i++)
		INIT_HLIST_HEAD(&kmem_table[i]);

	return (0);
}

static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
	unsigned long flags;
	kmem_debug_t *kd;
	char str[17];

	spin_lock_irqsave(lock, flags);
	if (!list_empty(list))
		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
		    "size", "data", "func", "line");

	list_for_each_entry(kd, list, kd_list) {
		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
		    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
		    kd->kd_func, kd->kd_line);
	}

	spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */

int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
	kmem_alloc_used_set(0);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

	return (0);
}

void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
	/*
	 * Display all unreclaimed memory addresses, including the
	 * allocation size and the first few bytes of what's located
	 * at that address to aid in debugging.  Performance is not
	 * a serious concern here since it is module unload time.
	 */
	if (kmem_alloc_used_read() != 0)
		printk(KERN_WARNING "kmem leaked %lu/%llu bytes\n",
		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}