module/os/linux/zfs/arc_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2018, Joyent, Inc.
  24  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28
  29 #include <sys/spa.h>
  30 #include <sys/zio.h>
  31 #include <sys/spa_impl.h>
  32 #include <sys/zio_compress.h>
  33 #include <sys/zio_checksum.h>
  34 #include <sys/zfs_context.h>
  35 #include <sys/arc.h>
  36 #include <sys/zfs_refcount.h>
  37 #include <sys/vdev.h>
  38 #include <sys/vdev_trim.h>
  39 #include <sys/vdev_impl.h>
  40 #include <sys/dsl_pool.h>
  41 #include <sys/multilist.h>
  42 #include <sys/abd.h>
  43 #include <sys/zil.h>
  44 #include <sys/fm/fs/zfs.h>
  45 #ifdef _KERNEL
  46 #include <sys/shrinker.h>
  47 #include <sys/vmsystm.h>
  48 #include <sys/zpl.h>
  49 #include <linux/page_compat.h>
  50 #include <linux/notifier.h>
  51 #include <linux/memory.h>
  52 #endif
  53 #include <sys/callb.h>
  54 #include <sys/kstat.h>
  55 #include <sys/zthr.h>
  56 #include <zfs_fletcher.h>
  57 #include <sys/arc_impl.h>
  58 #include <sys/trace_zfs.h>
  59 #include <sys/aggsum.h>
  60
/*
 * This is a limit on how many pages the ARC shrinker makes available for
 * eviction in response to one page allocation attempt.  Note that in
 * practice, the kernel's shrinker can ask us to evict up to about 4x this
 * for one allocation attempt.
 *
 * The default limit of 10,000 pages (about 40MB with 4K pages, so in
 * practice roughly 160MB per allocation attempt once the 4x factor above
 * is taken into account) limits the amount of time spent attempting to
 * reclaim ARC memory to less than 100ms per allocation attempt, even with
 * a small average compressed block size of ~8KB.
 *
 * The value is consumed by arc_shrinker_count(); see also the comment
 * there.  Set to 0 to disable the limit.
 */
int zfs_arc_shrinker_limit = 10000;
  76
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Notifier block used for memory hotplug events.  It is filled in and
 * registered by arc_register_hotplug() and removed again by
 * arc_unregister_hotplug().
 */
static struct notifier_block arc_hotplug_callback_mem_nb;
#endif
  80
  81 /*
  82  * Return a default max arc size based on the amount of physical memory.
  83  * This may be overridden by tuning the zfs_arc_max module parameter.
  84  */
  85 uint64_t
  86 arc_default_max(uint64_t min, uint64_t allmem)
  87 {
  88         uint64_t size;
  89
  90         if (allmem >= 1 << 30)
  91                 size = allmem - (1 << 30);
  92         else
  93                 size = min;
  94         return (MAX(allmem * 5 / 8, size));
  95 }
  96
  97 #ifdef _KERNEL
  98 /*
  99  * Return maximum amount of memory that we could possibly use.  Reduced
 100  * to half of all memory in user space which is primarily used for testing.
 101  */
 102 uint64_t
 103 arc_all_memory(void)
 104 {
 105 #ifdef CONFIG_HIGHMEM
 106         return (ptob(zfs_totalram_pages - zfs_totalhigh_pages));
 107 #else
 108         return (ptob(zfs_totalram_pages));
 109 #endif /* CONFIG_HIGHMEM */
 110 }
 111
 112 /*
 113  * Return the amount of memory that is considered free.  In user space
 114  * which is primarily used for testing we pretend that free memory ranges
 115  * from 0-20% of all memory.
 116  */
 117 uint64_t
 118 arc_free_memory(void)
 119 {
 120 #ifdef CONFIG_HIGHMEM
 121         struct sysinfo si;
 122         si_meminfo(&si);
 123         return (ptob(si.freeram - si.freehigh));
 124 #else
 125         return (ptob(nr_free_pages() +
 126             nr_inactive_file_pages()));
 127 #endif /* CONFIG_HIGHMEM */
 128 }
 129
 130 /*
 131  * Return the amount of memory that can be consumed before reclaim will be
 132  * needed.  Positive if there is sufficient free memory, negative indicates
 133  * the amount of memory that needs to be freed up.
 134  */
 135 int64_t
 136 arc_available_memory(void)
 137 {
 138         return (arc_free_memory() - arc_sys_free);
 139 }
 140
 141 static uint64_t
 142 arc_evictable_memory(void)
 143 {
 144         int64_t asize = aggsum_value(&arc_sums.arcstat_size);
 145         uint64_t arc_clean =
 146             zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
 147             zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
 148             zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
 149             zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 150         uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
 151
 152         /*
 153          * Scale reported evictable memory in proportion to page cache, cap
 154          * at specified min/max.
 155          */
 156         uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent;
 157         min = MAX(arc_c_min, MIN(arc_c_max, min));
 158
 159         if (arc_dirty >= min)
 160                 return (arc_clean);
 161
 162         return (MAX((int64_t)asize - (int64_t)min, 0));
 163 }
 164
 165 /*
 166  * The _count() function returns the number of free-able objects.
 167  * The _scan() function returns the number of objects that were freed.
 168  */
 169 static unsigned long
 170 arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
 171 {
 172         /*
 173          * __GFP_FS won't be set if we are called from ZFS code (see
 174          * kmem_flags_convert(), which removes it).  To avoid a deadlock, we
 175          * don't allow evicting in this case.  We return 0 rather than
 176          * SHRINK_STOP so that the shrinker logic doesn't accumulate a
 177          * deficit against us.
 178          */
 179         if (!(sc->gfp_mask & __GFP_FS)) {
 180                 return (0);
 181         }
 182
 183         /*
 184          * This code is reached in the "direct reclaim" case, where the
 185          * kernel (outside ZFS) is trying to allocate a page, and the system
 186          * is low on memory.
 187          *
 188          * The kernel's shrinker code doesn't understand how many pages the
 189          * ARC's callback actually frees, so it may ask the ARC to shrink a
 190          * lot for one page allocation. This is problematic because it may
 191          * take a long time, thus delaying the page allocation, and because
 192          * it may force the ARC to unnecessarily shrink very small.
 193          *
 194          * Therefore, we limit the amount of data that we say is evictable,
 195          * which limits the amount that the shrinker will ask us to evict for
 196          * one page allocation attempt.
 197          *
 198          * In practice, we may be asked to shrink 4x the limit to satisfy one
 199          * page allocation, before the kernel's shrinker code gives up on us.
 200          * When that happens, we rely on the kernel code to find the pages
 201          * that we freed before invoking the OOM killer.  This happens in
 202          * __alloc_pages_slowpath(), which retries and finds the pages we
 203          * freed when it calls get_page_from_freelist().
 204          *
 205          * See also the comment above zfs_arc_shrinker_limit.
 206          */
 207         int64_t limit = zfs_arc_shrinker_limit != 0 ?
 208             zfs_arc_shrinker_limit : INT64_MAX;
 209         return (MIN(limit, btop((int64_t)arc_evictable_memory())));
 210 }
 211
 212 static unsigned long
 213 arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
 214 {
 215         ASSERT((sc->gfp_mask & __GFP_FS) != 0);
 216
 217         /* The arc is considered warm once reclaim has occurred */
 218         if (unlikely(arc_warm == B_FALSE))
 219                 arc_warm = B_TRUE;
 220
 221         /*
 222          * Evict the requested number of pages by reducing arc_c and waiting
 223          * for the requested amount of data to be evicted.
 224          */
 225         arc_reduce_target_size(ptob(sc->nr_to_scan));
 226         arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
 227         if (current->reclaim_state != NULL)
 228 #ifdef  HAVE_RECLAIM_STATE_RECLAIMED
 229                 current->reclaim_state->reclaimed += sc->nr_to_scan;
 230 #else
 231                 current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
 232 #endif
 233
 234         /*
 235          * We are experiencing memory pressure which the arc_evict_zthr was
 236          * unable to keep up with. Set arc_no_grow to briefly pause arc
 237          * growth to avoid compounding the memory pressure.
 238          */
 239         arc_no_grow = B_TRUE;
 240
 241         /*
 242          * When direct reclaim is observed it usually indicates a rapid
 243          * increase in memory pressure.  This occurs because the kswapd
 244          * threads were unable to asynchronously keep enough free memory
 245          * available.
 246          */
 247         if (current_is_kswapd()) {
 248                 ARCSTAT_BUMP(arcstat_memory_indirect_count);
 249         } else {
 250                 ARCSTAT_BUMP(arcstat_memory_direct_count);
 251         }
 252
 253         return (sc->nr_to_scan);
 254 }
 255
/*
 * Declare the ARC shrinker, with arc_shrinker_count() and
 * arc_shrinker_scan() as its callbacks and DEFAULT_SEEKS as its seeks
 * value.  It is registered in arc_lowmem_init() and unregistered in
 * arc_lowmem_fini().
 */
SPL_SHRINKER_DECLARE(arc_shrinker,
    arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
 258
 259 int
 260 arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
 261 {
 262         uint64_t free_memory = arc_free_memory();
 263
 264         if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100)
 265                 return (0);
 266
 267         if (txg > spa->spa_lowmem_last_txg) {
 268                 spa->spa_lowmem_last_txg = txg;
 269                 spa->spa_lowmem_page_load = 0;
 270         }
 271         /*
 272          * If we are in pageout, we know that memory is already tight,
 273          * the arc is already going to be evicting, so we just want to
 274          * continue to let page writes occur as quickly as possible.
 275          */
 276         if (current_is_kswapd()) {
 277                 if (spa->spa_lowmem_page_load >
 278                     MAX(arc_sys_free / 4, free_memory) / 4) {
 279                         DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
 280                         return (SET_ERROR(ERESTART));
 281                 }
 282                 /* Note: reserve is inflated, so we deflate */
 283                 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
 284                 return (0);
 285         } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
 286                 /* memory is low, delay before restarting */
 287                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
 288                 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
 289                 return (SET_ERROR(EAGAIN));
 290         }
 291         spa->spa_lowmem_page_load = 0;
 292         return (0);
 293 }
 294
 295 static void
 296 arc_set_sys_free(uint64_t allmem)
 297 {
 298         /*
 299          * The ARC tries to keep at least this much memory available for the
 300          * system.  This gives the ARC time to shrink in response to memory
 301          * pressure, before running completely out of memory and invoking the
 302          * direct-reclaim ARC shrinker.
 303          *
 304          * This should be more than twice high_wmark_pages(), so that
 305          * arc_wait_for_eviction() will wait until at least the
 306          * high_wmark_pages() are free (see arc_evict_state_impl()).
 307          *
 308          * Note: Even when the system is very low on memory, the kernel's
 309          * shrinker code may only ask for one "batch" of pages (512KB) to be
 310          * evicted.  If concurrent allocations consume these pages, there may
 311          * still be insufficient free pages, and the OOM killer takes action.
 312          *
 313          * By setting arc_sys_free large enough, and having
 314          * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
 315          * free memory, it is much less likely that concurrent allocations can
 316          * consume all the memory that was evicted before checking for
 317          * OOM.
 318          *
 319          * It's hard to iterate the zones from a linux kernel module, which
 320          * makes it difficult to determine the watermark dynamically. Instead
 321          * we compute the maximum high watermark for this system, based
 322          * on the amount of memory, assuming default parameters on Linux kernel
 323          * 5.3.
 324          */
 325
 326         /*
 327          * Base wmark_low is 4 * the square root of Kbytes of RAM.
 328          */
 329         long wmark = 4 * int_sqrt(allmem/1024) * 1024;
 330
 331         /*
 332          * Clamp to between 128K and 64MB.
 333          */
 334         wmark = MAX(wmark, 128 * 1024);
 335         wmark = MIN(wmark, 64 * 1024 * 1024);
 336
 337         /*
 338          * watermark_boost can increase the wmark by up to 150%.
 339          */
 340         wmark += wmark * 150 / 100;
 341
 342         /*
 343          * arc_sys_free needs to be more than 2x the watermark, because
 344          * arc_wait_for_eviction() waits for half of arc_sys_free.  Bump this up
 345          * to 3x to ensure we're above it.
 346          */
 347         arc_sys_free = wmark * 3 + allmem / 32;
 348 }
 349
 350 void
 351 arc_lowmem_init(void)
 352 {
 353         uint64_t allmem = arc_all_memory();
 354
 355         /*
 356          * Register a shrinker to support synchronous (direct) memory
 357          * reclaim from the arc.  This is done to prevent kswapd from
 358          * swapping out pages when it is preferable to shrink the arc.
 359          */
 360         spl_register_shrinker(&arc_shrinker);
 361         arc_set_sys_free(allmem);
 362 }
 363
/*
 * Tear down low-memory handling by unregistering the ARC shrinker.
 */
void
arc_lowmem_fini(void)
{
        spl_unregister_shrinker(&arc_shrinker);
}
 369
 370 int
 371 param_set_arc_u64(const char *buf, zfs_kernel_param_t *kp)
 372 {
 373         int error;
 374
 375         error = spl_param_set_u64(buf, kp);
 376         if (error < 0)
 377                 return (SET_ERROR(error));
 378
 379         arc_tuning_update(B_TRUE);
 380
 381         return (0);
 382 }
 383
/*
 * Module parameter setter that forwards unchanged to param_set_arc_u64().
 * NOTE(review): presumably bound to the ARC minimum size tunable; the
 * binding is not visible in this file.
 */
int
param_set_arc_min(const char *buf, zfs_kernel_param_t *kp)
{
        return (param_set_arc_u64(buf, kp));
}
 389
/*
 * Module parameter setter that forwards unchanged to param_set_arc_u64().
 * NOTE(review): presumably bound to the ARC maximum size tunable; the
 * binding is not visible in this file.
 */
int
param_set_arc_max(const char *buf, zfs_kernel_param_t *kp)
{
        return (param_set_arc_u64(buf, kp));
}
 395
 396 int
 397 param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
 398 {
 399         int error;
 400
 401         error = param_set_int(buf, kp);
 402         if (error < 0)
 403                 return (SET_ERROR(error));
 404
 405         arc_tuning_update(B_TRUE);
 406
 407         return (0);
 408 }
 409
 410 #ifdef CONFIG_MEMORY_HOTPLUG
 411 static int
 412 arc_hotplug_callback(struct notifier_block *self, unsigned long action,
 413     void *arg)
 414 {
 415         (void) self, (void) arg;
 416         uint64_t allmem = arc_all_memory();
 417         if (action != MEM_ONLINE)
 418                 return (NOTIFY_OK);
 419
 420         arc_set_limits(allmem);
 421
 422 #ifdef __LP64__
 423         if (zfs_dirty_data_max_max == 0)
 424                 zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
 425                     allmem * zfs_dirty_data_max_max_percent / 100);
 426 #else
 427         if (zfs_dirty_data_max_max == 0)
 428                 zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
 429                     allmem * zfs_dirty_data_max_max_percent / 100);
 430 #endif
 431
 432         arc_set_sys_free(allmem);
 433         return (NOTIFY_OK);
 434 }
 435 #endif
 436
 437 void
 438 arc_register_hotplug(void)
 439 {
 440 #ifdef CONFIG_MEMORY_HOTPLUG
 441         arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback;
 442         /* There is no significance to the value 100 */
 443         arc_hotplug_callback_mem_nb.priority = 100;
 444         register_memory_notifier(&arc_hotplug_callback_mem_nb);
 445 #endif
 446 }
 447
/*
 * Remove the notifier block installed by arc_register_hotplug() from the
 * kernel's memory notifier.  This does nothing when the kernel lacks
 * CONFIG_MEMORY_HOTPLUG.
 */
void
arc_unregister_hotplug(void)
{
#ifdef CONFIG_MEMORY_HOTPLUG
        unregister_memory_notifier(&arc_hotplug_callback_mem_nb);
#endif
}
 455 #else /* _KERNEL */
 456 int64_t
 457 arc_available_memory(void)
 458 {
 459         int64_t lowest = INT64_MAX;
 460
 461         /* Every 100 calls, free a small amount */
 462         if (random_in_range(100) == 0)
 463                 lowest = -1024;
 464
 465         return (lowest);
 466 }
 467
/*
 * User-space stub: never throttles.  All arguments are ignored and 0 is
 * returned.
 */
int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
{
        (void) spa, (void) reserve, (void) txg;
        return (0);
}
 474
/*
 * User-space version of arc_all_memory(): report half of ptob(physmem) as
 * the maximum amount of memory that we could possibly use.
 */
uint64_t
arc_all_memory(void)
{
        return (ptob(physmem) / 2);
}
 480
 481 uint64_t
 482 arc_free_memory(void)
 483 {
 484         return (random_in_range(arc_all_memory() * 20 / 100));
 485 }
 486
/*
 * User-space stub: there is no memory hotplug notifier to register.
 */
void
arc_register_hotplug(void)
{
}
 491
/*
 * User-space stub: there is no memory hotplug notifier to unregister.
 */
void
arc_unregister_hotplug(void)
{
}
 496 #endif /* _KERNEL */
 497
/*
 * Export the shrinker_limit tunable (zfs_arc_ prefix, i.e. the
 * zfs_arc_shrinker_limit variable defined in this file) as an INT module
 * parameter with ZMOD_RW permissions.
 */
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
        "Limit on number of pages that ARC shrinker can reclaim at once");