/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
 */
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#include <sys/fm/fs/zfs.h>
#ifdef _KERNEL
#include <sys/shrinker.h>
#include <sys/vmsystm.h>
#include <linux/page_compat.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>
#include <sys/arc_impl.h>
#include <sys/trace_defs.h>
#include <sys/aggsum.h>
int64_t last_free_memory;
free_memory_reason_t last_free_reason;
#ifdef _KERNEL
/*
 * Return the maximum amount of memory that we could possibly use.  Reduced
 * to half of all memory in user space, which is primarily used for testing.
 */
uint64_t
arc_all_memory(void)
{
#ifdef CONFIG_HIGHMEM
	return (ptob(zfs_totalram_pages - zfs_totalhigh_pages));
#else
	return (ptob(zfs_totalram_pages));
#endif /* CONFIG_HIGHMEM */
}
/*
 * Return the amount of memory that is considered free.  In user space,
 * which is primarily used for testing, we pretend that free memory ranges
 * from 0-20% of all memory.
 */
uint64_t
arc_free_memory(void)
{
#ifdef CONFIG_HIGHMEM
	struct sysinfo si;

	si_meminfo(&si);
	return (ptob(si.freeram - si.freehigh));
#else
	return (ptob(nr_free_pages() +
	    nr_inactive_file_pages() +
	    nr_inactive_anon_pages() +
	    nr_slab_reclaimable_pages()));
#endif /* CONFIG_HIGHMEM */
}
/*
 * Additional reserve of pages for pp_reserve.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.
 */
int64_t arc_swapfs_reserve = 64;
/*
 * Return the amount of memory that can be consumed before reclaim will be
 * needed.  Positive if there is sufficient free memory, negative indicates
 * the amount of memory that needs to be freed up.
 */
int64_t
arc_available_memory(void)
{
	int64_t lowest = INT64_MAX;
	free_memory_reason_t r = FMR_UNKNOWN;
	int64_t n;

	pgcnt_t needfree = btop(arc_need_free);
	pgcnt_t lotsfree = btop(arc_sys_free);
	pgcnt_t desfree = 0;
	pgcnt_t freemem = btop(arc_free_memory());

	if (needfree > 0) {
		n = PAGESIZE * (-needfree);
		if (n < lowest) {
			lowest = n;
			r = FMR_NEEDFREE;
		}
	}

	/*
	 * Check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

#if defined(_ILP32)
	/*
	 * If we're on a 32-bit platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_arena compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25
	 * pages, which is so low that it's useless.  In this comparison, we
	 * calculate the total heap size and reclaim if more than 3/4ths of
	 * the heap is allocated.  (Or, in the calculation, if less than
	 * 1/4th is free.)
	 */
	n = vmem_size(heap_arena, VMEM_FREE) -
	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
	if (n < lowest) {
		lowest = n;
		r = FMR_HEAP_ARENA;
	}
#endif

	/*
	 * If zio data pages are being allocated out of a separate heap
	 * segment, then enforce that the size of available vmem for this
	 * arena remains above about 1/4th (1/(2^arc_zio_arena_free_shift))
	 * free.
	 *
	 * Note that reducing arc_zio_arena_free_shift keeps more virtual
	 * memory (in the zio_arena) free, which can avoid memory
	 * fragmentation issues.
	 */
	if (zio_arena != NULL) {
		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
		    (vmem_size(zio_arena, VMEM_ALLOC) >>
		    arc_zio_arena_free_shift);
		if (n < lowest) {
			lowest = n;
			r = FMR_ZIO_ARENA;
		}
	}

	last_free_memory = lowest;
	last_free_reason = r;

	return (lowest);
}
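/*
 * Illustrative usage, not part of this file: consumers generally key off
 * the sign of the return value.  The reclaim check in arc.c is assumed
 * to reduce to something like:
 *
 *	boolean_t
 *	arc_reclaim_needed(void)
 *	{
 *		return (arc_available_memory() < 0);
 *	}
 */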
static uint64_t
arc_evictable_memory(void)
{
	int64_t asize = aggsum_value(&arc_size);
	uint64_t arc_clean =
	    zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
	    zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
	    zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
	    zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
	uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);

	/*
	 * Scale reported evictable memory in proportion to page cache, cap
	 * at specified min/max.
	 */
	uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent;
	min = MAX(arc_c_min, MIN(arc_c_max, min));

	if (arc_dirty >= min)
		return (arc_clean);

	return (MAX((int64_t)asize - (int64_t)min, 0));
}
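/*
 * Worked example for the scaling above, with illustrative numbers: if
 * nr_file_pages() reports 16 GiB of page cache and zfs_arc_pc_percent
 * is 10, then min = (16 GiB / 100) * 10 = ~1.6 GiB, clamped to the
 * [arc_c_min, arc_c_max] range.  Only ARC space beyond that floor is
 * reported to the shrinker as evictable.
 */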
/*
 * If sc->nr_to_scan is zero, the caller is requesting a query of the
 * number of objects which can potentially be freed.  If it is nonzero,
 * the request is to free that many objects.
 *
 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
 * in struct shrinker and also require the shrinker to return the number
 * of objects freed.
 *
 * Older kernels require the shrinker to return the number of freeable
 * objects following the freeing of nr_to_free.
 */
static spl_shrinker_t
__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
{
	int64_t pages;

	/* The arc is considered warm once reclaim has occurred */
	if (unlikely(arc_warm == B_FALSE))
		arc_warm = B_TRUE;

	/* Return the potential number of reclaimable pages */
	pages = btop((int64_t)arc_evictable_memory());
	if (sc->nr_to_scan == 0)
		return (pages);

	/* Not allowed to perform filesystem reclaim */
	if (!(sc->gfp_mask & __GFP_FS))
		return (SHRINK_STOP);

	/* Reclaim in progress */
	if (mutex_tryenter(&arc_adjust_lock) == 0) {
		ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan));
		return (0);
	}
	mutex_exit(&arc_adjust_lock);

	/*
	 * Evict the requested number of pages by reducing arc_c by the
	 * requested amount.
	 */
	if (pages > 0) {
		arc_reduce_target_size(ptob(sc->nr_to_scan));
		if (current_is_kswapd())
			arc_kmem_reap_soon();
#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
		pages = MAX((int64_t)pages -
		    (int64_t)btop(arc_evictable_memory()), 0);
#else
		pages = btop(arc_evictable_memory());
#endif
		/*
		 * We've shrunk what we can, wake up threads.
		 */
		cv_broadcast(&arc_adjust_waiters_cv);
	} else {
		pages = SHRINK_STOP;
	}

	/*
	 * When direct reclaim is observed it usually indicates a rapid
	 * increase in memory pressure.  This occurs because the kswapd
	 * threads were unable to asynchronously keep enough free memory
	 * available.  In this case set arc_no_grow to briefly pause arc
	 * growth to avoid compounding the memory pressure.
	 */
	if (current_is_kswapd()) {
		ARCSTAT_BUMP(arcstat_memory_indirect_count);
	} else {
		arc_no_grow = B_TRUE;
		arc_kmem_reap_soon();
		ARCSTAT_BUMP(arcstat_memory_direct_count);
	}

	return (pages);
}

SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);

SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
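/*
 * For reference, a sketch of the split-callback form used on kernels
 * with HAVE_SPLIT_SHRINKER_CALLBACK (the exact expansion lives in the
 * SPL headers; the callback names here are illustrative only):
 *
 *	static unsigned long
 *	arc_shrinker_count(struct shrinker *s, struct shrink_control *sc)
 *	{
 *		return (btop((int64_t)arc_evictable_memory()));
 *	}
 *
 *	static unsigned long
 *	arc_shrinker_scan(struct shrinker *s, struct shrink_control *sc)
 *	{
 *		return (__arc_shrinker_func(s, sc));
 *	}
 */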
int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
{
	uint64_t available_memory = arc_free_memory();

#if defined(_ILP32)
	available_memory =
	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif

	if (available_memory > arc_all_memory() * arc_lotsfree_percent / 100)
		return (0);

	if (txg > spa->spa_lowmem_last_txg) {
		spa->spa_lowmem_last_txg = txg;
		spa->spa_lowmem_page_load = 0;
	}
	/*
	 * If we are in pageout, we know that memory is already tight,
	 * the arc is already going to be evicting, so we just want to
	 * continue to let page writes occur as quickly as possible.
	 */
	if (current_is_kswapd()) {
		if (spa->spa_lowmem_page_load >
		    MAX(arc_sys_free / 4, available_memory) / 4) {
			DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
			return (SET_ERROR(ERESTART));
		}
		/* Note: reserve is inflated, so we deflate */
		atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
		return (0);
	} else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
		/* memory is low, delay before restarting */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
		return (SET_ERROR(EAGAIN));
	}
	spa->spa_lowmem_page_load = 0;
	return (0);
}
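/*
 * Illustrative call site, assumed rather than quoted from arc.c:
 * arc_tempreserve_space() consults this throttle before reserving
 * space for a transaction and propagates the ERESTART/EAGAIN errors
 * so the DMU backs off and retries:
 *
 *	error = arc_memory_throttle(spa, reserve, txg);
 *	if (error != 0)
 *		return (error);
 */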
void
arc_lowmem_init(void)
{
	uint64_t allmem = arc_all_memory();

	/*
	 * Register a shrinker to support synchronous (direct) memory
	 * reclaim from the arc.  This is done to prevent kswapd from
	 * swapping out pages when it is preferable to shrink the arc.
	 */
	spl_register_shrinker(&arc_shrinker);

	/* Set to 1/64 of all memory or a minimum of 512K */
	arc_sys_free = MAX(allmem / 64, (512 * 1024));
	arc_need_free = 0;
}
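/*
 * For example, on a 32 GiB system the line above yields
 * arc_sys_free = MAX(32 GiB / 64, 512 KiB) = 512 MiB, i.e. reclaim
 * aims to keep roughly 1/64 of all memory free for the kernel.
 */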
void
arc_lowmem_fini(void)
{
	spl_unregister_shrinker(&arc_shrinker);
}
int
param_set_arc_long(const char *buf, zfs_kernel_param_t *kp)
{
	int error;

	error = param_set_long(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	/* Apply the updated tunable to the live ARC */
	arc_tuning_update();

	return (0);
}

int
param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
{
	int error;

	error = param_set_int(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	/* Apply the updated tunable to the live ARC */
	arc_tuning_update();

	return (0);
}
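/*
 * Illustrative wiring, assumed rather than quoted from the tunable
 * definitions: these setters are intended for module_param_call() so
 * that writes to /sys/module/zfs/parameters/ re-run
 * arc_tuning_update(), e.g.:
 *
 *	module_param_call(zfs_arc_min, param_set_arc_long,
 *	    param_get_long, &zfs_arc_min, 0644);
 */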
#else /* _KERNEL */

int64_t
arc_available_memory(void)
{
	int64_t lowest = INT64_MAX;
	free_memory_reason_t r = FMR_UNKNOWN;

	/* Every 100 calls, free a small amount */
	if (spa_get_random(100) == 0)
		lowest = -1024;

	last_free_memory = lowest;
	last_free_reason = r;

	return (lowest);
}

int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
{
	return (0);
}

uint64_t
arc_all_memory(void)
{
	return (ptob(physmem) / 2);
}

uint64_t
arc_free_memory(void)
{
	return (spa_get_random(arc_all_memory() * 20 / 100));
}

#endif /* _KERNEL */
/*
 * Helper function for arc_prune_async(); it is responsible for safely
 * handling the execution of a registered arc_prune_func_t.
 */
static void
arc_prune_task(void *ptr)
{
	arc_prune_t *ap = (arc_prune_t *)ptr;
	arc_prune_func_t *func = ap->p_pfunc;

	if (func != NULL)
		func(ap->p_adjust, ap->p_private);

	zfs_refcount_remove(&ap->p_refcnt, func);
}
/*
 * Notify registered consumers they must drop holds on a portion of the ARC
 * buffers they reference.  This provides a mechanism to ensure the ARC can
 * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
 * is analogous to dnlc_reduce_cache() but more generic.
 *
 * This operation is performed asynchronously so it may be safely called
 * in the context of the arc_reclaim_thread().  A reference is taken here
 * for each registered arc_prune_t and the arc_prune_task() is responsible
 * for releasing it once the registered arc_prune_func_t has completed.
 */
void
arc_prune_async(int64_t adjust)
{
	arc_prune_t *ap;

	mutex_enter(&arc_prune_mtx);
	for (ap = list_head(&arc_prune_list); ap != NULL;
	    ap = list_next(&arc_prune_list, ap)) {

		/* Skip callbacks with a prune task already outstanding */
		if (zfs_refcount_count(&ap->p_refcnt) >= 2)
			continue;

		zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
		ap->p_adjust = adjust;
		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
		    ap, TQ_SLEEP) == TASKQID_INVALID) {
			zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
			continue;
		}
		ARCSTAT_BUMP(arcstat_prune);
	}
	mutex_exit(&arc_prune_mtx);
}
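/*
 * Illustrative registration, assumed rather than quoted from the ZPL:
 * a consumer registers its pruning callback with
 * arc_add_prune_callback(), and arc_prune_async() then fans requests
 * out to it through arc_prune_taskq, e.g.:
 *
 *	zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
 */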