1 '\" te
2 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
3 .\" Copyright (c) 2017 Datto Inc.
4 .\" The contents of this file are subject to the terms of the Common Development
5 .\" and Distribution License (the "License"). You may not use this file except
6 .\" in compliance with the License. You can obtain a copy of the license at
7 .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
8 .\"
9 .\" See the License for the specific language governing permissions and
10 .\" limitations under the License. When distributing Covered Code, include this
11 .\" CDDL HEADER in each file and include the License file at
12 .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this
13 .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
14 .\" own identifying information:
15 .\" Portions Copyright [yyyy] [name of copyright owner]
16 .TH ZFS-MODULE-PARAMETERS 5 "Oct 28, 2017"
17 .SH NAME
18 zfs\-module\-parameters \- ZFS module parameters
19 .SH DESCRIPTION
20 .sp
21 .LP
22 Description of the different parameters to the ZFS module.
23
24 .SS "Module parameters"
25 .sp
26 .LP
27
28 .sp
29 .ne 2
30 .na
31 \fBdbuf_cache_max_bytes\fR (ulong)
32 .ad
33 .RS 12n
34 Maximum size in bytes of the dbuf cache. When \fB0\fR this value will default
35 to \fB1/2^dbuf_cache_shift\fR (1/32) of the target ARC size, otherwise the
36 provided value in bytes will be used. The behavior of the dbuf cache and its
37 associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR
38 kstat.
39 .sp
40 Default value: \fB0\fR.
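.sp
As a sketch of how this might be inspected and tuned at runtime (the 256 MiB
figure below is purely illustrative, and the path assumes the usual
\fB/sys/module/zfs/parameters\fR location for ZFS module parameters):
.sp
.nf
  # Observe current dbuf cache behavior
  cat /proc/spl/kstat/zfs/dbufstats
  # Pin the dbuf cache to an explicit 256 MiB instead of 1/32 of the ARC target
  echo $((256 * 1024 * 1024)) > /sys/module/zfs/parameters/dbuf_cache_max_bytes
.fi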
41 .RE
42
43 .sp
44 .ne 2
45 .na
46 \fBdbuf_cache_hiwater_pct\fR (uint)
47 .ad
48 .RS 12n
49 The percentage over \fBdbuf_cache_max_bytes\fR when dbufs must be evicted
50 directly.
51 .sp
52 Default value: \fB10\fR%.
53 .RE
54
55 .sp
56 .ne 2
57 .na
58 \fBdbuf_cache_lowater_pct\fR (uint)
59 .ad
60 .RS 12n
61 The percentage below \fBdbuf_cache_max_bytes\fR when the evict thread stops
62 evicting dbufs.
63 .sp
64 Default value: \fB10\fR%.
65 .RE
66
67 .sp
68 .ne 2
69 .na
70 \fBdbuf_cache_shift\fR (int)
71 .ad
72 .RS 12n
73 Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction
74 of the target arc size.
75 .sp
76 Default value: \fB5\fR.
77 .RE
78
79 .sp
80 .ne 2
81 .na
82 \fBignore_hole_birth\fR (int)
83 .ad
84 .RS 12n
85 When set, the hole_birth optimization will not be used, and all holes will
86 always be sent on zfs send. Useful if you suspect your datasets are affected
87 by a bug in hole_birth.
88 .sp
89 Use \fB1\fR for on (default) and \fB0\fR for off.
90 .RE
91
92 .sp
93 .ne 2
94 .na
95 \fBl2arc_feed_again\fR (int)
96 .ad
97 .RS 12n
98 Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as
99 fast as possible.
100 .sp
101 Use \fB1\fR for yes (default) and \fB0\fR to disable.
102 .RE
103
104 .sp
105 .ne 2
106 .na
107 \fBl2arc_feed_min_ms\fR (ulong)
108 .ad
109 .RS 12n
110 Min feed interval in milliseconds. Requires \fBl2arc_feed_again=1\fR and only
111 applicable in related situations.
112 .sp
113 Default value: \fB200\fR.
114 .RE
115
116 .sp
117 .ne 2
118 .na
119 \fBl2arc_feed_secs\fR (ulong)
120 .ad
121 .RS 12n
122 Seconds between L2ARC writing
123 .sp
124 Default value: \fB1\fR.
125 .RE
126
127 .sp
128 .ne 2
129 .na
130 \fBl2arc_headroom\fR (ulong)
131 .ad
132 .RS 12n
133 How far through the ARC lists to search for L2ARC cacheable content, expressed
134 as a multiplier of \fBl2arc_write_max\fR
135 .sp
136 Default value: \fB2\fR.
137 .RE
138
139 .sp
140 .ne 2
141 .na
142 \fBl2arc_headroom_boost\fR (ulong)
143 .ad
144 .RS 12n
145 Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being
146 successfully compressed before writing. A value of 100 disables this feature.
147 .sp
148 Default value: \fB200\fR%.
149 .RE
150
151 .sp
152 .ne 2
153 .na
154 \fBl2arc_noprefetch\fR (int)
155 .ad
156 .RS 12n
157 Do not write buffers to L2ARC if they were prefetched but not used by
158 applications
159 .sp
160 Use \fB1\fR for yes (default) and \fB0\fR to disable.
161 .RE
162
163 .sp
164 .ne 2
165 .na
166 \fBl2arc_norw\fR (int)
167 .ad
168 .RS 12n
169 No reads during writes
170 .sp
171 Use \fB1\fR for yes and \fB0\fR for no (default).
172 .RE
173
174 .sp
175 .ne 2
176 .na
177 \fBl2arc_write_boost\fR (ulong)
178 .ad
179 .RS 12n
180 Cold L2ARC devices will have \fBl2arc_write_max\fR increased by this amount
181 while they remain cold.
182 .sp
183 Default value: \fB8,388,608\fR.
184 .RE
185
186 .sp
187 .ne 2
188 .na
189 \fBl2arc_write_max\fR (ulong)
190 .ad
191 .RS 12n
192 Max write bytes per interval
193 .sp
194 Default value: \fB8,388,608\fR.
195 .RE
196
197 .sp
198 .ne 2
199 .na
200 \fBmetaslab_aliquot\fR (ulong)
201 .ad
202 .RS 12n
203 Metaslab granularity, in bytes. This is roughly similar to what would be
204 referred to as the "stripe size" in traditional RAID arrays. In normal
205 operation, ZFS will try to write this amount of data to a top-level vdev
206 before moving on to the next one.
207 .sp
208 Default value: \fB524,288\fR.
209 .RE
210
211 .sp
212 .ne 2
213 .na
214 \fBmetaslab_bias_enabled\fR (int)
215 .ad
216 .RS 12n
217 Enable metaslab group biasing based on its vdev's over- or under-utilization
218 relative to the pool.
219 .sp
220 Use \fB1\fR for yes (default) and \fB0\fR for no.
221 .RE
222
223 .sp
224 .ne 2
225 .na
226 \fBmetaslab_force_ganging\fR (ulong)
227 .ad
228 .RS 12n
229 Make some blocks above a certain size be gang blocks. This option is used
230 by the test suite to facilitate testing.
231 .sp
232 Default value: \fB16,777,217\fR.
233 .RE
234
235 .sp
236 .ne 2
237 .na
238 \fBzfs_metaslab_segment_weight_enabled\fR (int)
239 .ad
240 .RS 12n
241 Enable/disable segment-based metaslab selection.
242 .sp
243 Use \fB1\fR for yes (default) and \fB0\fR for no.
244 .RE
245
246 .sp
247 .ne 2
248 .na
249 \fBzfs_metaslab_switch_threshold\fR (int)
250 .ad
251 .RS 12n
252 When using segment-based metaslab selection, continue allocating
253 from the active metaslab until \fBzfs_metaslab_switch_threshold\fR
254 worth of buckets have been exhausted.
255 .sp
256 Default value: \fB2\fR.
257 .RE
258
259 .sp
260 .ne 2
261 .na
262 \fBmetaslab_debug_load\fR (int)
263 .ad
264 .RS 12n
265 Load all metaslabs during pool import.
266 .sp
267 Use \fB1\fR for yes and \fB0\fR for no (default).
268 .RE
269
270 .sp
271 .ne 2
272 .na
273 \fBmetaslab_debug_unload\fR (int)
274 .ad
275 .RS 12n
276 Prevent metaslabs from being unloaded.
277 .sp
278 Use \fB1\fR for yes and \fB0\fR for no (default).
279 .RE
280
281 .sp
282 .ne 2
283 .na
284 \fBmetaslab_fragmentation_factor_enabled\fR (int)
285 .ad
286 .RS 12n
287 Enable use of the fragmentation metric in computing metaslab weights.
288 .sp
289 Use \fB1\fR for yes (default) and \fB0\fR for no.
290 .RE
291
292 .sp
293 .ne 2
294 .na
295 \fBmetaslabs_per_vdev\fR (int)
296 .ad
297 .RS 12n
298 When a vdev is added, it will be divided into approximately (but no more than) this number of metaslabs.
299 .sp
300 Default value: \fB200\fR.
301 .RE
302
303 .sp
304 .ne 2
305 .na
306 \fBmetaslab_preload_enabled\fR (int)
307 .ad
308 .RS 12n
309 Enable metaslab group preloading.
310 .sp
311 Use \fB1\fR for yes (default) and \fB0\fR for no.
312 .RE
313
314 .sp
315 .ne 2
316 .na
317 \fBmetaslab_lba_weighting_enabled\fR (int)
318 .ad
319 .RS 12n
320 Give more weight to metaslabs with lower LBAs, assuming they have
321 greater bandwidth as is typically the case on a modern constant
322 angular velocity disk drive.
323 .sp
324 Use \fB1\fR for yes (default) and \fB0\fR for no.
325 .RE
326
327 .sp
328 .ne 2
329 .na
330 \fBspa_config_path\fR (charp)
331 .ad
332 .RS 12n
333 SPA config file
334 .sp
335 Default value: \fB/etc/zfs/zpool.cache\fR.
336 .RE
337
338 .sp
339 .ne 2
340 .na
341 \fBspa_asize_inflation\fR (int)
342 .ad
343 .RS 12n
344 Multiplication factor used to estimate actual disk consumption from the
345 size of data being written. The default value is a worst case estimate,
346 but lower values may be valid for a given pool depending on its
347 configuration. Pool administrators who understand the factors involved
348 may wish to specify a more realistic inflation factor, particularly if
349 they operate close to quota or capacity limits.
350 .sp
351 Default value: \fB24\fR.
352 .RE
353
354 .sp
355 .ne 2
356 .na
357 \fBspa_load_verify_data\fR (int)
358 .ad
359 .RS 12n
360 Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR)
361 import. Use 0 to disable and 1 to enable.
362
363 An extreme rewind import normally performs a full traversal of all
364 blocks in the pool for verification. If this parameter is set to 0,
365 the traversal skips non-metadata blocks. It can be toggled once the
366 import has started to stop or start the traversal of non-metadata blocks.
367 .sp
368 Default value: \fB1\fR.
369 .RE
370
371 .sp
372 .ne 2
373 .na
374 \fBspa_load_verify_metadata\fR (int)
375 .ad
376 .RS 12n
377 Whether to traverse blocks during an "extreme rewind" (\fB-X\fR)
378 pool import. Use 0 to disable and 1 to enable.
379
380 An extreme rewind import normally performs a full traversal of all
381 blocks in the pool for verification. If this parameter is set to 0,
382 the traversal is not performed. It can be toggled once the import has
383 started to stop or start the traversal.
384 .sp
385 Default value: \fB1\fR.
386 .RE
387
388 .sp
389 .ne 2
390 .na
391 \fBspa_load_verify_maxinflight\fR (int)
392 .ad
393 .RS 12n
394 Maximum concurrent I/Os during the traversal performed during an "extreme
395 rewind" (\fB-X\fR) pool import.
396 .sp
397 Default value: \fB10000\fR.
398 .RE
399
400 .sp
401 .ne 2
402 .na
403 \fBspa_slop_shift\fR (int)
404 .ad
405 .RS 12n
406 Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space
407 in the pool to be consumed. This ensures that we don't run the pool
408 completely out of space, due to unaccounted changes (e.g. to the MOS).
409 It also limits the worst-case time to allocate space. If we have
410 less than this amount of free space, most ZPL operations (e.g. write,
411 create) will return ENOSPC.
412 .sp
413 Default value: \fB5\fR.
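.sp
As a worked example, with the default \fBspa_slop_shift\fR of 5 the reserved
slop space is 1/2^5 = 1/32 of the pool, so a hypothetical 10 TiB pool would
keep roughly 320 GiB unavailable to normal allocations.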
414 .RE
415
416 .sp
417 .ne 2
418 .na
419 \fBzfetch_array_rd_sz\fR (ulong)
420 .ad
421 .RS 12n
422 If prefetching is enabled, disable prefetching for reads larger than this size.
423 .sp
424 Default value: \fB1,048,576\fR.
425 .RE
426
427 .sp
428 .ne 2
429 .na
430 \fBzfetch_max_distance\fR (uint)
431 .ad
432 .RS 12n
433 Max bytes to prefetch per stream (default 8MB).
434 .sp
435 Default value: \fB8,388,608\fR.
436 .RE
437
438 .sp
439 .ne 2
440 .na
441 \fBzfetch_max_streams\fR (uint)
442 .ad
443 .RS 12n
444 Max number of streams per zfetch (prefetch streams per file).
445 .sp
446 Default value: \fB8\fR.
447 .RE
448
449 .sp
450 .ne 2
451 .na
452 \fBzfetch_min_sec_reap\fR (uint)
453 .ad
454 .RS 12n
455 Min time before an active prefetch stream can be reclaimed
456 .sp
457 Default value: \fB2\fR.
458 .RE
459
460 .sp
461 .ne 2
462 .na
463 \fBzfs_arc_dnode_limit\fR (ulong)
464 .ad
465 .RS 12n
466 When the number of bytes consumed by dnodes in the ARC exceeds this number of
467 bytes, try to unpin some of it in response to demand for non-metadata. This
468 value acts as a ceiling to the amount of dnode metadata, and defaults to 0,
469 which indicates that a percentage based on \fBzfs_arc_dnode_limit_percent\fR of
470 the ARC meta buffers may be used for dnodes.
471
472 See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used
473 when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather
474 than in response to overall demand for non-metadata.
475
476 .sp
477 Default value: \fB0\fR.
478 .RE
479
480 .sp
481 .ne 2
482 .na
483 \fBzfs_arc_dnode_limit_percent\fR (ulong)
484 .ad
485 .RS 12n
486 Percentage of ARC meta buffers that may be consumed by dnodes.
487 .sp
488 See also \fBzfs_arc_dnode_limit\fR which serves a similar purpose but has a
489 higher priority if set to a nonzero value.
490 .sp
491 Default value: \fB10\fR%.
492 .RE
493
494 .sp
495 .ne 2
496 .na
497 \fBzfs_arc_dnode_reduce_percent\fR (ulong)
498 .ad
499 .RS 12n
500 Percentage of ARC dnodes to try to scan in response to demand for non-metadata
501 when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR.
502
503 .sp
504 Default value: \fB10\fR% of the number of dnodes in the ARC.
505 .RE
506
507 .sp
508 .ne 2
509 .na
510 \fBzfs_arc_average_blocksize\fR (int)
511 .ad
512 .RS 12n
513 The ARC's buffer hash table is sized based on the assumption of an average
514 block size of \fBzfs_arc_average_blocksize\fR (default 8K). This works out
515 to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers.
516 For configurations with a known larger average block size this value can be
517 increased to reduce the memory footprint.
518
519 .sp
520 Default value: \fB8192\fR.
521 .RE
522
523 .sp
524 .ne 2
525 .na
526 \fBzfs_arc_evict_batch_limit\fR (int)
527 .ad
528 .RS 12n
529 Number of ARC headers to evict per sub-list before proceeding to another sub-list.
530 This batch-style operation prevents entire sub-lists from being evicted at once
531 but comes at a cost of additional unlocking and locking.
532 .sp
533 Default value: \fB10\fR.
534 .RE
535
536 .sp
537 .ne 2
538 .na
539 \fBzfs_arc_grow_retry\fR (int)
540 .ad
541 .RS 12n
542 If set to a nonzero value, it will replace the arc_grow_retry value with this value.
543 The arc_grow_retry value (default 5) is the number of seconds the ARC will wait before
544 trying to resume growth after a memory pressure event.
545 .sp
546 Default value: \fB0\fR.
547 .RE
548
549 .sp
550 .ne 2
551 .na
552 \fBzfs_arc_lotsfree_percent\fR (int)
553 .ad
554 .RS 12n
555 Throttle I/O when free system memory drops below this percentage of total
556 system memory. Setting this value to 0 will disable the throttle.
557 .sp
558 Default value: \fB10\fR%.
559 .RE
560
561 .sp
562 .ne 2
563 .na
564 \fBzfs_arc_max\fR (ulong)
565 .ad
566 .RS 12n
567 Maximum size of the ARC in bytes. If set to 0 then it will consume 1/2 of system
568 RAM. This value must be at least 67108864 (64 megabytes).
569 .sp
570 This value can be changed dynamically with some caveats. It cannot be set back
571 to 0 while running and reducing it below the current ARC size will not cause
572 the ARC to shrink without memory pressure to induce shrinking.
573 .sp
574 Default value: \fB0\fR.
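.sp
For example, to cap the ARC at 8 GiB (an illustrative value, not a
recommendation), the option can be set persistently in a modprobe
configuration file or changed at runtime through the module parameter:
.sp
.nf
  # e.g. /etc/modprobe.d/zfs.conf
  options zfs zfs_arc_max=8589934592

  # Runtime change (takes effect without reloading the module)
  echo 8589934592 > /sys/module/zfs/parameters/zfs_arc_max
.fi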
575 .RE
576
577 .sp
578 .ne 2
579 .na
580 \fBzfs_arc_meta_adjust_restarts\fR (ulong)
581 .ad
582 .RS 12n
583 The number of restart passes to make while scanning the ARC attempting
584 to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
585 This value should not need to be tuned but is available to facilitate
586 performance analysis.
587 .sp
588 Default value: \fB4096\fR.
589 .RE
590
591 .sp
592 .ne 2
593 .na
594 \fBzfs_arc_meta_limit\fR (ulong)
595 .ad
596 .RS 12n
597 The maximum size in bytes that meta data buffers are allowed to
598 consume in the ARC. When this limit is reached meta data buffers will
599 be reclaimed even if the overall arc_c_max has not been reached. This
600 value defaults to 0, which indicates that a percentage based on
601 \fBzfs_arc_meta_limit_percent\fR of the ARC may be used for meta data.
602 .sp
603 This value may be changed dynamically, except that it cannot be set back to 0
604 for a specific percent of the ARC; it must be set to an explicit value.
605 .sp
606 Default value: \fB0\fR.
607 .RE
608
609 .sp
610 .ne 2
611 .na
612 \fBzfs_arc_meta_limit_percent\fR (ulong)
613 .ad
614 .RS 12n
615 Percentage of ARC buffers that can be used for meta data.
616
617 See also \fBzfs_arc_meta_limit\fR which serves a similar purpose but has a
618 higher priority if set to a nonzero value.
619
620 .sp
621 Default value: \fB75\fR%.
622 .RE
623
624 .sp
625 .ne 2
626 .na
627 \fBzfs_arc_meta_min\fR (ulong)
628 .ad
629 .RS 12n
630 The minimum allowed size in bytes that meta data buffers may consume in
631 the ARC. This value defaults to 0, which disables a floor on the amount
632 of the ARC devoted to meta data.
633 .sp
634 Default value: \fB0\fR.
635 .RE
636
637 .sp
638 .ne 2
639 .na
640 \fBzfs_arc_meta_prune\fR (int)
641 .ad
642 .RS 12n
643 The number of dentries and inodes to be scanned looking for entries
644 which can be dropped. This may be required when the ARC reaches the
645 \fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers
646 in the ARC. Increasing this value will cause the dentry and inode caches
647 to be pruned more aggressively. Setting this value to 0 will disable
648 pruning the inode and dentry caches.
649 .sp
650 Default value: \fB10,000\fR.
651 .RE
652
653 .sp
654 .ne 2
655 .na
656 \fBzfs_arc_meta_strategy\fR (int)
657 .ad
658 .RS 12n
659 Define the strategy for ARC meta data buffer eviction (meta reclaim strategy).
660 A value of 0 (META_ONLY) will evict only the ARC meta data buffers.
661 A value of 1 (BALANCED) indicates that additional data buffers may be evicted if
662 that is required in order to evict the required number of meta data buffers.
663 .sp
664 Default value: \fB1\fR.
665 .RE
666
667 .sp
668 .ne 2
669 .na
670 \fBzfs_arc_min\fR (ulong)
671 .ad
672 .RS 12n
673 Minimum size of the ARC in bytes. If set to 0 then arc_c_min will default to
674 consuming the larger of 32M or 1/32 of total system memory.
675 .sp
676 Default value: \fB0\fR.
677 .RE
678
679 .sp
680 .ne 2
681 .na
682 \fBzfs_arc_min_prefetch_ms\fR (int)
683 .ad
684 .RS 12n
685 Minimum time prefetched blocks are locked in the ARC, specified in ms.
686 A value of \fB0\fR will default to 1000 ms.
687 .sp
688 Default value: \fB0\fR.
689 .RE
690
691 .sp
692 .ne 2
693 .na
694 \fBzfs_arc_min_prescient_prefetch_ms\fR (int)
695 .ad
696 .RS 12n
697 Minimum time "prescient prefetched" blocks are locked in the ARC, specified
698 in ms. These blocks are meant to be prefetched fairly aggressively ahead of
699 the code that may use them. A value of \fB0\fR will default to 6000 ms.
700 .sp
701 Default value: \fB0\fR.
702 .RE
703
704 .sp
705 .ne 2
706 .na
707 \fBzfs_multilist_num_sublists\fR (int)
708 .ad
709 .RS 12n
710 To allow more fine-grained locking, each ARC state contains a series
711 of lists for both data and meta data objects. Locking is performed at
712 the level of these "sub-lists". This parameter controls the number of
713 sub-lists per ARC state, and also applies to other uses of the
714 multilist data structure.
715 .sp
716 Default value: \fB4\fR or the number of online CPUs, whichever is greater
717 .RE
718
719 .sp
720 .ne 2
721 .na
722 \fBzfs_arc_overflow_shift\fR (int)
723 .ad
724 .RS 12n
725 The ARC size is considered to be overflowing if it exceeds the current
726 ARC target size (arc_c) by a threshold determined by this parameter.
727 The threshold is calculated as a fraction of arc_c using the formula
728 "arc_c >> \fBzfs_arc_overflow_shift\fR".
729
730 The default value of 8 causes the ARC to be considered to be overflowing
731 if it exceeds the target size by 1/256th (roughly 0.4%) of the target size.
732
733 When the ARC is overflowing, new buffer allocations are stalled until
734 the reclaim thread catches up and the overflow condition no longer exists.
735 .sp
736 Default value: \fB8\fR.
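.sp
For example, with an ARC target size (arc_c) of 8 GiB and the default shift of
8, the overflow threshold is 8 GiB >> 8 = 32 MiB; allocations stall once the
ARC exceeds its target by more than that amount.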
737 .RE
738
739 .sp
740 .ne 2
741 .na
742
743 \fBzfs_arc_p_min_shift\fR (int)
744 .ad
745 .RS 12n
746 If set to a nonzero value, this will update arc_p_min_shift (default 4)
747 with the new value.
748 arc_p_min_shift is used as a shift of arc_c when calculating both the minimum
749 and maximum arc_p.
750 .sp
751 Default value: \fB0\fR.
752 .RE
753
754 .sp
755 .ne 2
756 .na
757 \fBzfs_arc_p_dampener_disable\fR (int)
758 .ad
759 .RS 12n
760 Disable arc_p adapt dampener
761 .sp
762 Use \fB1\fR for yes (default) and \fB0\fR to disable.
763 .RE
764
765 .sp
766 .ne 2
767 .na
768 \fBzfs_arc_shrink_shift\fR (int)
769 .ad
770 .RS 12n
771 If set to a nonzero value, this will update arc_shrink_shift (default 7)
772 with the new value.
773 .sp
774 Default value: \fB0\fR.
775 .RE
776
777 .sp
778 .ne 2
779 .na
780 \fBzfs_arc_pc_percent\fR (uint)
781 .ad
782 .RS 12n
783 Percent of pagecache to reclaim arc to
784
785 This tunable allows ZFS arc to play more nicely with the kernel's LRU
786 pagecache. It can guarantee that the arc size won't collapse under scanning
787 pressure on the pagecache, yet still allows arc to be reclaimed down to
788 zfs_arc_min if necessary. This value is specified as percent of pagecache
789 size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This
790 only operates during memory pressure/reclaim.
791 .sp
792 Default value: \fB0\fR% (disabled).
793 .RE
794
795 .sp
796 .ne 2
797 .na
798 \fBzfs_arc_sys_free\fR (ulong)
799 .ad
800 .RS 12n
801 The target number of bytes the ARC should leave as free memory on the system.
802 Defaults to the larger of 1/64 of physical memory or 512K. Setting this
803 option to a non-zero value will override the default.
804 .sp
805 Default value: \fB0\fR.
806 .RE
807
808 .sp
809 .ne 2
810 .na
811 \fBzfs_autoimport_disable\fR (int)
812 .ad
813 .RS 12n
814 Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR).
815 .sp
816 Use \fB1\fR for yes (default) and \fB0\fR for no.
817 .RE
818
819 .sp
820 .ne 2
821 .na
822 \fBzfs_checksums_per_second\fR (int)
823 .ad
824 .RS 12n
825 Rate limit checksum events to this many per second. Note that this should
826 not be set below the zed thresholds (currently 10 checksums over 10 sec)
827 or else zed may not trigger any action.
828 .sp
829 Default value: \fB20\fR.
830 .RE
831
832 .sp
833 .ne 2
834 .na
835 \fBzfs_commit_timeout_pct\fR (int)
836 .ad
837 .RS 12n
838 This controls the amount of time that a ZIL block (lwb) will remain "open"
839 when it isn't "full", and it has a thread waiting for it to be committed to
840 stable storage. The timeout is scaled based on a percentage of the last lwb
841 latency to avoid significantly impacting the latency of each individual
842 transaction record (itx).
843 .sp
844 Default value: \fB5\fR%.
845 .RE
846
847 .sp
848 .ne 2
849 .na
850 \fBzfs_dbgmsg_enable\fR (int)
851 .ad
852 .RS 12n
853 Internally ZFS keeps a small log to facilitate debugging. By default the log
854 is disabled, to enable it set this option to 1. The contents of the log can
855 be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file. Writing 0 to
856 this proc file clears the log.
857 .sp
858 Default value: \fB0\fR.
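.sp
For example (the values written below follow directly from the description
above):
.sp
.nf
  # Enable the internal debug log, read it, then clear it
  echo 1 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
  cat /proc/spl/kstat/zfs/dbgmsg
  echo 0 > /proc/spl/kstat/zfs/dbgmsg
.fi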
859 .RE
860
861 .sp
862 .ne 2
863 .na
864 \fBzfs_dbgmsg_maxsize\fR (int)
865 .ad
866 .RS 12n
867 The maximum size in bytes of the internal ZFS debug log.
868 .sp
869 Default value: \fB4M\fR.
870 .RE
871
872 .sp
873 .ne 2
874 .na
875 \fBzfs_dbuf_state_index\fR (int)
876 .ad
877 .RS 12n
878 This feature is currently unused. It is normally used for controlling what
879 reporting is available under /proc/spl/kstat/zfs.
880 .sp
881 Default value: \fB0\fR.
882 .RE
883
884 .sp
885 .ne 2
886 .na
887 \fBzfs_deadman_enabled\fR (int)
888 .ad
889 .RS 12n
890 When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR
891 milliseconds, or when an individual I/O takes longer than
892 \fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to
893 be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is
894 invoked as described by the \fBzfs_deadman_failmode\fR module option.
895 By default the deadman is enabled and configured to \fBwait\fR which results
896 in "hung" I/Os only being logged. The deadman is automatically disabled
897 when a pool gets suspended.
898 .sp
899 Default value: \fB1\fR.
900 .RE
901
902 .sp
903 .ne 2
904 .na
905 \fBzfs_deadman_failmode\fR (charp)
906 .ad
907 .RS 12n
908 Controls the failure behavior when the deadman detects a "hung" I/O. Valid
909 values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR.
910 .sp
911 \fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a
912 "deadman" event will be posted describing that I/O.
913 .sp
914 \fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it
915 to the I/O pipeline if possible.
916 .sp
917 \fBpanic\fR - Panic the system. This can be used to facilitate an automatic
918 fail-over to a properly configured fail-over partner.
919 .sp
920 Default value: \fBwait\fR.
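.sp
A minimal sketch of selecting a different failure mode at runtime, assuming
the parameter is writable under the usual \fB/sys/module/zfs/parameters\fR
location:
.sp
.nf
  # Re-dispatch "hung" I/Os instead of only logging them
  echo continue > /sys/module/zfs/parameters/zfs_deadman_failmode
.fi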
921 .RE
922
923 .sp
924 .ne 2
925 .na
926 \fBzfs_deadman_checktime_ms\fR (int)
927 .ad
928 .RS 12n
929 Check time in milliseconds. This defines the frequency at which we check
930 for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior.
931 .sp
932 Default value: \fB60,000\fR.
933 .RE
934
935 .sp
936 .ne 2
937 .na
938 \fBzfs_deadman_synctime_ms\fR (ulong)
939 .ad
940 .RS 12n
941 Interval in milliseconds after which the deadman is triggered and also
942 the interval after which a pool sync operation is considered to be "hung".
943 Once this limit is exceeded the deadman will be invoked every
944 \fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes.
945 .sp
946 Default value: \fB600,000\fR.
947 .RE
948
949 .sp
950 .ne 2
951 .na
952 \fBzfs_deadman_ziotime_ms\fR (ulong)
953 .ad
954 .RS 12n
955 Interval in milliseconds after which the deadman is triggered and an
956 individual IO operation is considered to be "hung". As long as the I/O
957 remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
958 milliseconds until the I/O completes.
959 .sp
960 Default value: \fB300,000\fR.
961 .RE
962
963 .sp
964 .ne 2
965 .na
966 \fBzfs_dedup_prefetch\fR (int)
967 .ad
968 .RS 12n
969 Enable prefetching dedup-ed blocks
970 .sp
971 Use \fB1\fR for yes and \fB0\fR to disable (default).
972 .RE
973
974 .sp
975 .ne 2
976 .na
977 \fBzfs_delay_min_dirty_percent\fR (int)
978 .ad
979 .RS 12n
980 Start to delay each transaction once there is this amount of dirty data,
981 expressed as a percentage of \fBzfs_dirty_data_max\fR.
982 This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
983 See the section "ZFS TRANSACTION DELAY".
984 .sp
985 Default value: \fB60\fR%.
986 .RE
987
988 .sp
989 .ne 2
990 .na
991 \fBzfs_delay_scale\fR (int)
992 .ad
993 .RS 12n
994 This controls how quickly the transaction delay approaches infinity.
995 Larger values cause longer delays for a given amount of dirty data.
996 .sp
997 For the smoothest delay, this value should be about 1 billion divided
998 by the maximum number of operations per second. This will smoothly
999 handle between 10x and 1/10th this number.
1000 .sp
1001 See the section "ZFS TRANSACTION DELAY".
1002 .sp
1003 Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64.
1004 .sp
1005 Default value: \fB500,000\fR.
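.sp
As a worked example of the guideline above, a pool expected to sustain at most
100,000 write operations per second would suggest a \fBzfs_delay_scale\fR of
roughly 1,000,000,000 / 100,000 = 10,000.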
1006 .RE
1007
1008 .sp
1009 .ne 2
1010 .na
1011 \fBzfs_delays_per_second\fR (int)
1012 .ad
1013 .RS 12n
1014 Rate limit IO delay events to this many per second.
1015 .sp
1016 Default value: \fB20\fR.
1017 .RE
1018
1019 .sp
1020 .ne 2
1021 .na
1022 \fBzfs_delete_blocks\fR (ulong)
1023 .ad
1024 .RS 12n
1025 This is used to define a large file for the purposes of delete. Files
1026 containing more than \fBzfs_delete_blocks\fR blocks will be deleted asynchronously
1027 while smaller files are deleted synchronously. Decreasing this value will
1028 reduce the time spent in an unlink(2) system call at the expense of a longer
1029 delay before the freed space is available.
1030 .sp
1031 Default value: \fB20,480\fR.
1032 .RE
1033
1034 .sp
1035 .ne 2
1036 .na
1037 \fBzfs_dirty_data_max\fR (int)
1038 .ad
1039 .RS 12n
1040 Determines the dirty space limit in bytes. Once this limit is exceeded, new
1041 writes are halted until space frees up. This parameter takes precedence
1042 over \fBzfs_dirty_data_max_percent\fR.
1043 See the section "ZFS TRANSACTION DELAY".
1044 .sp
1045 Default value: \fB10\fR% of physical RAM, capped at \fBzfs_dirty_data_max_max\fR.
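.sp
As an illustrative sketch, a system with 32 GiB of RAM would default to about
3.2 GiB of dirty data; an explicit limit can be set in bytes if the default is
not appropriate (the 4 GiB value below is only an example):
.sp
.nf
  echo $((4 * 1024 * 1024 * 1024)) > /sys/module/zfs/parameters/zfs_dirty_data_max
.fi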
1046 .RE
1047
1048 .sp
1049 .ne 2
1050 .na
1051 \fBzfs_dirty_data_max_max\fR (int)
1052 .ad
1053 .RS 12n
1054 Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes.
1055 This limit is only enforced at module load time, and will be ignored if
1056 \fBzfs_dirty_data_max\fR is later changed. This parameter takes
1057 precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section
1058 "ZFS TRANSACTION DELAY".
1059 .sp
1060 Default value: \fB25\fR% of physical RAM.
1061 .RE
1062
1063 .sp
1064 .ne 2
1065 .na
1066 \fBzfs_dirty_data_max_max_percent\fR (int)
1067 .ad
1068 .RS 12n
1069 Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a
1070 percentage of physical RAM. This limit is only enforced at module load
1071 time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed.
1072 The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this
1073 one. See the section "ZFS TRANSACTION DELAY".
1074 .sp
1075 Default value: \fB25\fR%.
1076 .RE
1077
1078 .sp
1079 .ne 2
1080 .na
1081 \fBzfs_dirty_data_max_percent\fR (int)
1082 .ad
1083 .RS 12n
1084 Determines the dirty space limit, expressed as a percentage of all
1085 memory. Once this limit is exceeded, new writes are halted until space frees
1086 up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this
1087 one. See the section "ZFS TRANSACTION DELAY".
1088 .sp
1089 Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR.
1090 .RE
1091
1092 .sp
1093 .ne 2
1094 .na
1095 \fBzfs_dirty_data_sync\fR (int)
1096 .ad
1097 .RS 12n
1098 Start syncing out a transaction group if there is at least this much dirty data.
1099 .sp
1100 Default value: \fB67,108,864\fR.
1101 .RE
1102
1103 .sp
1104 .ne 2
1105 .na
1106 \fBzfs_fletcher_4_impl\fR (string)
1107 .ad
1108 .RS 12n
1109 Select a fletcher 4 implementation.
1110 .sp
1111 Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
1112 \fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR.
1113 All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
1114 set extensions to be available and will only appear if ZFS detects that they are
1115 present at runtime. If multiple implementations of fletcher 4 are available,
1116 the \fBfastest\fR will be chosen using a micro benchmark. Selecting \fBscalar\fR
1117 results in the original, CPU based calculation, being used. Selecting any option
1118 other than \fBfastest\fR and \fBscalar\fR results in vector instructions from
1119 the respective CPU instruction set being used.
1120 .sp
1121 Default value: \fBfastest\fR.
1122 .RE
1123
1124 .sp
1125 .ne 2
1126 .na
1127 \fBzfs_free_bpobj_enabled\fR (int)
1128 .ad
1129 .RS 12n
1130 Enable/disable the processing of the free_bpobj object.
1131 .sp
1132 Default value: \fB1\fR.
1133 .RE
1134
1135 .sp
1136 .ne 2
1137 .na
1138 \fBzfs_async_block_max_blocks\fR (ulong)
1139 .ad
1140 .RS 12n
1141 Maximum number of blocks freed in a single txg.
1142 .sp
1143 Default value: \fB100,000\fR.
1144 .RE
1145
1146 .sp
1147 .ne 2
1148 .na
1149 \fBzfs_vdev_async_read_max_active\fR (int)
1150 .ad
1151 .RS 12n
1152 Maximum asynchronous read I/Os active to each device.
1153 See the section "ZFS I/O SCHEDULER".
1154 .sp
1155 Default value: \fB3\fR.
1156 .RE
1157
1158 .sp
1159 .ne 2
1160 .na
1161 \fBzfs_vdev_async_read_min_active\fR (int)
1162 .ad
1163 .RS 12n
1164 Minimum asynchronous read I/Os active to each device.
1165 See the section "ZFS I/O SCHEDULER".
1166 .sp
1167 Default value: \fB1\fR.
1168 .RE
1169
1170 .sp
1171 .ne 2
1172 .na
1173 \fBzfs_vdev_async_write_active_max_dirty_percent\fR (int)
1174 .ad
1175 .RS 12n
1176 When the pool has more than
1177 \fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use
1178 \fBzfs_vdev_async_write_max_active\fR to limit active async writes. If
1179 the dirty data is between min and max, the active I/O limit is linearly
1180 interpolated. See the section "ZFS I/O SCHEDULER".
1181 .sp
1182 Default value: \fB60\fR%.
1183 .RE
1184
1185 .sp
1186 .ne 2
1187 .na
1188 \fBzfs_vdev_async_write_active_min_dirty_percent\fR (int)
1189 .ad
1190 .RS 12n
1191 When the pool has less than
1192 \fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use
1193 \fBzfs_vdev_async_write_min_active\fR to limit active async writes. If
1194 the dirty data is between min and max, the active I/O limit is linearly
1195 interpolated. See the section "ZFS I/O SCHEDULER".
1196 .sp
1197 Default value: \fB30\fR%.
1198 .RE
1199
1200 .sp
1201 .ne 2
1202 .na
1203 \fBzfs_vdev_async_write_max_active\fR (int)
1204 .ad
1205 .RS 12n
1206 Maximum asynchronous write I/Os active to each device.
1207 See the section "ZFS I/O SCHEDULER".
1208 .sp
1209 Default value: \fB10\fR.
1210 .RE
1211
1212 .sp
1213 .ne 2
1214 .na
1215 \fBzfs_vdev_async_write_min_active\fR (int)
1216 .ad
1217 .RS 12n
1218 Minimum asynchronous write I/Os active to each device.
1219 See the section "ZFS I/O SCHEDULER".
1220 .sp
1221 Lower values are associated with better latency on rotational media but poorer
1222 resilver performance. The default value of 2 was chosen as a compromise. A
1223 value of 3 has been shown to improve resilver performance further at a cost of
1224 further increasing latency.
1225 .sp
1226 Default value: \fB2\fR.
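.sp
For example, to favor resilver throughput over latency as described above (a
sketch; whether this trade-off is worthwhile depends on the workload):
.sp
.nf
  echo 3 > /sys/module/zfs/parameters/zfs_vdev_async_write_min_active
.fi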
1227 .RE
1228
1229 .sp
1230 .ne 2
1231 .na
1232 \fBzfs_vdev_max_active\fR (int)
1233 .ad
1234 .RS 12n
1235 The maximum number of I/Os active to each device. Ideally, this will be >=
1236 the sum of each queue's max_active. It must be at least the sum of each
1237 queue's min_active. See the section "ZFS I/O SCHEDULER".
1238 .sp
1239 Default value: \fB1,000\fR.
1240 .RE
1241
1242 .sp
1243 .ne 2
1244 .na
1245 \fBzfs_vdev_scrub_max_active\fR (int)
1246 .ad
1247 .RS 12n
1248 Maximum scrub I/Os active to each device.
1249 See the section "ZFS I/O SCHEDULER".
1250 .sp
1251 Default value: \fB2\fR.
1252 .RE
1253
1254 .sp
1255 .ne 2
1256 .na
1257 \fBzfs_vdev_scrub_min_active\fR (int)
1258 .ad
1259 .RS 12n
1260 Minimum scrub I/Os active to each device.
1261 See the section "ZFS I/O SCHEDULER".
1262 .sp
1263 Default value: \fB1\fR.
1264 .RE
1265
1266 .sp
1267 .ne 2
1268 .na
1269 \fBzfs_vdev_sync_read_max_active\fR (int)
1270 .ad
1271 .RS 12n
1272 Maximum synchronous read I/Os active to each device.
1273 See the section "ZFS I/O SCHEDULER".
1274 .sp
1275 Default value: \fB10\fR.
1276 .RE
1277
1278 .sp
1279 .ne 2
1280 .na
1281 \fBzfs_vdev_sync_read_min_active\fR (int)
1282 .ad
1283 .RS 12n
1284 Minimum synchronous read I/Os active to each device.
1285 See the section "ZFS I/O SCHEDULER".
1286 .sp
1287 Default value: \fB10\fR.
1288 .RE
1289
1290 .sp
1291 .ne 2
1292 .na
1293 \fBzfs_vdev_sync_write_max_active\fR (int)
1294 .ad
1295 .RS 12n
1296 Maximum synchronous write I/Os active to each device.
1297 See the section "ZFS I/O SCHEDULER".
1298 .sp
1299 Default value: \fB10\fR.
1300 .RE
1301
1302 .sp
1303 .ne 2
1304 .na
1305 \fBzfs_vdev_sync_write_min_active\fR (int)
1306 .ad
1307 .RS 12n
1308 Minimum synchronous write I/Os active to each device.
1309 See the section "ZFS I/O SCHEDULER".
1310 .sp
1311 Default value: \fB10\fR.
1312 .RE
1313
1314 .sp
1315 .ne 2
1316 .na
1317 \fBzfs_vdev_queue_depth_pct\fR (int)
1318 .ad
1319 .RS 12n
1320 Maximum number of queued allocations per top-level vdev expressed as
1321 a percentage of \fBzfs_vdev_async_write_max_active\fR which allows the
1322 system to detect devices that are more capable of handling allocations
1323 and to allocate more blocks to those devices. It allows for dynamic
1324 allocation distribution when devices are imbalanced as fuller devices
1325 will tend to be slower than empty devices.
1326
1327 See also \fBzio_dva_throttle_enabled\fR.
1328 .sp
1329 Default value: \fB1000\fR%.
1330 .RE
1331
1332 .sp
1333 .ne 2
1334 .na
1335 \fBzfs_expire_snapshot\fR (int)
1336 .ad
1337 .RS 12n
1338 Seconds to expire .zfs/snapshot
1339 .sp
1340 Default value: \fB300\fR.
1341 .RE
1342
1343 .sp
1344 .ne 2
1345 .na
1346 \fBzfs_admin_snapshot\fR (int)
1347 .ad
1348 .RS 12n
1349 Allow the creation, removal, or renaming of entries in the .zfs/snapshot
1350 directory to cause the creation, destruction, or renaming of snapshots.
1351 When enabled this functionality works both locally and over NFS exports
1352 which have the 'no_root_squash' option set. This functionality is disabled
1353 by default.
1354 .sp
1355 Use \fB1\fR for yes and \fB0\fR for no (default).
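.sp
For example, to enable this behavior at module load time (a sketch of the
usual modprobe configuration mechanism):
.sp
.nf
  # e.g. /etc/modprobe.d/zfs.conf
  options zfs zfs_admin_snapshot=1
.fi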
1356 .RE
1357
1358 .sp
1359 .ne 2
1360 .na
1361 \fBzfs_flags\fR (int)
1362 .ad
1363 .RS 12n
1364 Set additional debugging flags. The following flags may be bitwise-or'd
1365 together.
1366 .sp
1367 .TS
1368 box;
1369 rB lB
1370 lB lB
1371 r l.
1372 Value Symbolic Name
1373 Description
1374 _
1375 1 ZFS_DEBUG_DPRINTF
1376 Enable dprintf entries in the debug log.
1377 _
1378 2 ZFS_DEBUG_DBUF_VERIFY *
1379 Enable extra dbuf verifications.
1380 _
1381 4 ZFS_DEBUG_DNODE_VERIFY *
1382 Enable extra dnode verifications.
1383 _
1384 8 ZFS_DEBUG_SNAPNAMES
1385 Enable snapshot name verification.
1386 _
1387 16 ZFS_DEBUG_MODIFY
1388 Check for illegally modified ARC buffers.
1389 _
1390 32 ZFS_DEBUG_SPA
1391 Enable spa_dbgmsg entries in the debug log.
1392 _
1393 64 ZFS_DEBUG_ZIO_FREE
1394 Enable verification of block frees.
1395 _
1396 128 ZFS_DEBUG_HISTOGRAM_VERIFY
1397 Enable extra spacemap histogram verifications.
1398 _
1399 256 ZFS_DEBUG_METASLAB_VERIFY
1400 Verify space accounting on disk matches in-core range_trees.
1401 _
1402 512 ZFS_DEBUG_SET_ERROR
1403 Enable SET_ERROR and dprintf entries in the debug log.
1404 .TE
1405 .sp
1406 * Requires debug build.
1407 .sp
1408 Default value: \fB0\fR.
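.sp
For example, enabling ZFS_DEBUG_DPRINTF (1), ZFS_DEBUG_SPA (32) and
ZFS_DEBUG_SET_ERROR (512) together means setting the bitwise OR
1 | 32 | 512 = 545:
.sp
.nf
  echo 545 > /sys/module/zfs/parameters/zfs_flags
.fi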
1409 .RE
1410
1411 .sp
1412 .ne 2
1413 .na
1414 \fBzfs_free_leak_on_eio\fR (int)
1415 .ad
1416 .RS 12n
1417 If destroy encounters an EIO while reading metadata (e.g. indirect
1418 blocks), space referenced by the missing metadata can not be freed.
1419 Normally this causes the background destroy to become "stalled", as
1420 it is unable to make forward progress. While in this stalled state,
1421 all remaining space to free from the error-encountering filesystem is
1422 "temporarily leaked". Set this flag to cause it to ignore the EIO,
1423 permanently leak the space from indirect blocks that can not be read,
1424 and continue to free everything else that it can.
1425
1426 The default, "stalling" behavior is useful if the storage partially
1427 fails (i.e. some but not all i/os fail), and then later recovers. In
1428 this case, we will be able to continue pool operations while it is
1429 partially failed, and when it recovers, we can continue to free the
1430 space, with no leaks. However, note that this case is actually
1431 fairly rare.
1432
1433 Typically pools either (a) fail completely (but perhaps temporarily,
1434 e.g. a top-level vdev going offline), or (b) have localized,
1435 permanent errors (e.g. disk returns the wrong data due to bit flip or
1436 firmware bug). In case (a), this setting does not matter because the
1437 pool will be suspended and the sync thread will not be able to make
1438 forward progress regardless. In case (b), because the error is
1439 permanent, the best we can do is leak the minimum amount of space,
1440 which is what setting this flag will do. Therefore, it is reasonable
1441 for this flag to normally be set, but we chose the more conservative
1442 approach of not setting it, so that there is no possibility of
1443 leaking space in the "partial temporary" failure case.
1444 .sp
1445 Default value: \fB0\fR.
1446 .RE
1447
1448 .sp
1449 .ne 2
1450 .na
1451 \fBzfs_free_min_time_ms\fR (int)
1452 .ad
1453 .RS 12n
1454 During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a minimum
1455 of this much time will be spent working on freeing blocks per txg.
1456 .sp
1457 Default value: \fB1,000\fR.
1458 .RE
1459
1460 .sp
1461 .ne 2
1462 .na
1463 \fBzfs_immediate_write_sz\fR (long)
1464 .ad
1465 .RS 12n
1466 Largest data block to write to zil. Larger blocks will be treated as if the
1467 dataset being written to had the property setting \fBlogbias=throughput\fR.
1468 .sp
1469 Default value: \fB32,768\fR.
1470 .RE
1471
1472 .sp
1473 .ne 2
1474 .na
1475 \fBzfs_max_recordsize\fR (int)
1476 .ad
1477 .RS 12n
1478 We currently support block sizes from 512 bytes to 16MB. The benefits of
1479 larger blocks, and thus larger IO, need to be weighed against the cost of
1480 COWing a giant block to modify one byte. Additionally, very large blocks
1481 can have an impact on i/o latency, and also potentially on the memory
1482 allocator. Therefore, we do not allow the recordsize to be set larger than
1483 zfs_max_recordsize (default 1MB). Larger blocks can be created by changing
1484 this tunable, and pools with larger blocks can always be imported and used,
1485 regardless of this setting.
1486 .sp
1487 Default value: \fB1,048,576\fR.
1488 .RE
1489
1490 .sp
1491 .ne 2
1492 .na
1493 \fBzfs_metaslab_fragmentation_threshold\fR (int)
1494 .ad
1495 .RS 12n
1496 Allow metaslabs to keep their active state as long as their fragmentation
1497 percentage is less than or equal to this value. An active metaslab that
1498 exceeds this threshold will no longer keep its active status allowing
1499 better metaslabs to be selected.
1500 .sp
1501 Default value: \fB70\fR.
1502 .RE
1503
1504 .sp
1505 .ne 2
1506 .na
1507 \fBzfs_mg_fragmentation_threshold\fR (int)
1508 .ad
1509 .RS 12n
1510 Metaslab groups are considered eligible for allocations if their
1511 fragmentation metric (measured as a percentage) is less than or equal to
1512 this value. If a metaslab group exceeds this threshold then it will be
1513 skipped unless all metaslab groups within the metaslab class have also
1514 crossed this threshold.
1515 .sp
1516 Default value: \fB85\fR.
1517 .RE
1518
1519 .sp
1520 .ne 2
1521 .na
1522 \fBzfs_mg_noalloc_threshold\fR (int)
1523 .ad
1524 .RS 12n
1525 Defines a threshold at which metaslab groups should be eligible for
1526 allocations. The value is expressed as a percentage of free space
1527 beyond which a metaslab group is always eligible for allocations.
1528 If a metaslab group's free space is less than or equal to the
1529 threshold, the allocator will avoid allocating to that group
1530 unless all groups in the pool have reached the threshold. Once all
1531 groups have reached the threshold, all groups are allowed to accept
1532 allocations. The default value of 0 disables the feature and causes
1533 all metaslab groups to be eligible for allocations.
1534
1535 This parameter allows one to deal with pools having heavily imbalanced
1536 vdevs such as would be the case when a new vdev has been added.
1537 Setting the threshold to a non-zero percentage will stop allocations
1538 from being made to vdevs that aren't filled to the specified percentage
1539 and allow lesser filled vdevs to acquire more allocations than they
1540 otherwise would under the old \fBzfs_mg_alloc_failures\fR facility.
1541 .sp
1542 Default value: \fB0\fR.
1543 .RE
1544
1545 .sp
1546 .ne 2
1547 .na
1548 \fBzfs_multihost_history\fR (int)
1549 .ad
1550 .RS 12n
1551 Historical statistics for the last N multihost updates will be available in
1552 \fB/proc/spl/kstat/zfs/<pool>/multihost\fR
1553 .sp
1554 Default value: \fB0\fR.
1555 .RE
1556
1557 .sp
1558 .ne 2
1559 .na
1560 \fBzfs_multihost_interval\fR (ulong)
1561 .ad
1562 .RS 12n
1563 Used to control the frequency of multihost writes which are performed when the
1564 \fBmultihost\fR pool property is on. This is one factor used to determine
1565 the length of the activity check during import.
1566 .sp
1567 The multihost write period is \fBzfs_multihost_interval / leaf-vdevs\fR milliseconds.
1568 This means that on average a multihost write will be issued for each leaf vdev every
1569 \fBzfs_multihost_interval\fR milliseconds. In practice, the observed period can
1570 vary with the I/O load and this observed value is the delay which is stored in
1571 the uberblock.
1572 .sp
1573 On import the activity check waits a minimum amount of time determined by
1574 \fBzfs_multihost_interval * zfs_multihost_import_intervals\fR. The activity
1575 check time may be further extended if the value of mmp delay found in the best
1576 uberblock indicates actual multihost updates happened at longer intervals than
1577 \fBzfs_multihost_interval\fR. A minimum value of \fB100ms\fR is enforced.
1578 .sp
1579 Default value: \fB1000\fR.
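.sp
As a worked example, a pool with 10 leaf vdevs and the default interval of
1000 ms issues a multihost write roughly every 100 ms (one per leaf vdev per
interval), and with \fBzfs_multihost_import_intervals=10\fR the activity check
on import waits at least 1000 ms * 10 = 10 seconds.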
1580 .RE
1581
1582 .sp
1583 .ne 2
1584 .na
1585 \fBzfs_multihost_import_intervals\fR (uint)
1586 .ad
1587 .RS 12n
1588 Used to control the duration of the activity test on import. Smaller values of
1589 \fBzfs_multihost_import_intervals\fR will reduce the import time but increase
1590 the risk of failing to detect an active pool. The total activity check time is
1591 never allowed to drop below one second. A value of 0 is ignored and treated as
1592 if it was set to 1.
1593 .sp
1594 Default value: \fB10\fR.
1595 .RE
1596
1597 .sp
1598 .ne 2
1599 .na
1600 \fBzfs_multihost_fail_intervals\fR (uint)
1601 .ad
1602 .RS 12n
1603 Controls the behavior of the pool when multihost write failures are detected.
1604 .sp
1605 When \fBzfs_multihost_fail_intervals = 0\fR then multihost write failures are ignored.
1606 The failures will still be reported to the ZED which depending on its
1607 configuration may take action such as suspending the pool or offlining a device.
1608 .sp
1609 When \fBzfs_multihost_fail_intervals > 0\fR then sequential multihost write failures
1610 will cause the pool to be suspended. This occurs when
1611 \fBzfs_multihost_fail_intervals * zfs_multihost_interval\fR milliseconds have
1612 passed since the last successful multihost write. This guarantees the activity test
1613 will see multihost writes if the pool is imported.
1614 .sp
1615 Default value: \fB5\fR.
1616 .RE
1617
1618 .sp
1619 .ne 2
1620 .na
1621 \fBzfs_no_scrub_io\fR (int)
1622 .ad
1623 .RS 12n
1624 Set for no scrub I/O. This results in scrubs not actually scrubbing data and
1625 simply doing a metadata crawl of the pool instead.
1626 .sp
1627 Use \fB1\fR for yes and \fB0\fR for no (default).
1628 .RE
1629
1630 .sp
1631 .ne 2
1632 .na
1633 \fBzfs_no_scrub_prefetch\fR (int)
1634 .ad
1635 .RS 12n
1636 Set to disable block prefetching for scrubs.
1637 .sp
1638 Use \fB1\fR for yes and \fB0\fR for no (default).
1639 .RE
1640
1641 .sp
1642 .ne 2
1643 .na
1644 \fBzfs_nocacheflush\fR (int)
1645 .ad
1646 .RS 12n
1647 Disable cache flush operations on disks when writing. Beware, this may cause
1648 corruption if disks re-order writes.
1649 .sp
1650 Use \fB1\fR for yes and \fB0\fR for no (default).
1651 .RE
1652
1653 .sp
1654 .ne 2
1655 .na
1656 \fBzfs_nopwrite_enabled\fR (int)
1657 .ad
1658 .RS 12n
1659 Enable NOP writes
1660 .sp
1661 Use \fB1\fR for yes (default) and \fB0\fR to disable.
1662 .RE
1663
1664 .sp
1665 .ne 2
1666 .na
1667 \fBzfs_dmu_offset_next_sync\fR (int)
1668 .ad
1669 .RS 12n
1670 Enable forcing txg sync to find holes. When enabled, ZFS acts like prior
1671 versions when the SEEK_HOLE or SEEK_DATA flags are used: if a dnode is
1672 dirty, the txg is synced so that up-to-date hole information can be
1673 found.
1674 .sp
1675 Use \fB1\fR for yes and \fB0\fR to disable (default).
1676 .RE
1677
1678 .sp
1679 .ne 2
1680 .na
1681 \fBzfs_pd_bytes_max\fR (int)
1682 .ad
1683 .RS 12n
1684 The number of bytes which should be prefetched during a pool traversal
1685 (eg: \fBzfs send\fR or other data crawling operations)
1686 .sp
1687 Default value: \fB52,428,800\fR.
1688 .RE
1689
1690 .sp
1691 .ne 2
1692 .na
1693 \fBzfs_per_txg_dirty_frees_percent\fR (ulong)
1694 .ad
1695 .RS 12n
1696 Tunable to control percentage of dirtied blocks from frees in one TXG.
1697 After this threshold is crossed, additional dirty blocks from frees
1698 wait until the next TXG.
1699 A value of zero will disable this throttle.
1700 .sp
1701 Default value: \fB30\fR%.
1702 .RE
1703
1704
1705
1706 .sp
1707 .ne 2
1708 .na
1709 \fBzfs_prefetch_disable\fR (int)
1710 .ad
1711 .RS 12n
1712 This tunable disables predictive prefetch. Note that it leaves "prescient"
1713 prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
1714 prescient prefetch never issues i/os that end up not being needed, so it
1715 can't hurt performance.
1716 .sp
1717 Use \fB1\fR for yes and \fB0\fR for no (default).
1718 .RE
1719
1720 .sp
1721 .ne 2
1722 .na
1723 \fBzfs_read_chunk_size\fR (long)
1724 .ad
1725 .RS 12n
1726 Bytes to read per chunk
1727 .sp
1728 Default value: \fB1,048,576\fR.
1729 .RE
1730
1731 .sp
1732 .ne 2
1733 .na
1734 \fBzfs_read_history\fR (int)
1735 .ad
1736 .RS 12n
1737 Historical statistics for the last N reads will be available in
1738 \fB/proc/spl/kstat/zfs/<pool>/reads\fR
1739 .sp
1740 Default value: \fB0\fR (no data is kept).
1741 .RE
1742
1743 .sp
1744 .ne 2
1745 .na
1746 \fBzfs_read_history_hits\fR (int)
1747 .ad
1748 .RS 12n
1749 Include cache hits in read history
1750 .sp
1751 Use \fB1\fR for yes and \fB0\fR for no (default).
1752 .RE
1753
1754 .sp
1755 .ne 2
1756 .na
1757 \fBzfs_reconstruct_indirect_combinations_max\fR (int)
1758 .ad
1759 .RS 12n
1760 If an indirect split block contains more than this many possible unique
1761 combinations when being reconstructed, consider it too computationally
1762 expensive to check them all. Instead, try at most
1763 \fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected
1764 combinations each time the block is accessed. This allows all segment
1765 copies to participate fairly in the reconstruction when all combinations
1766 cannot be checked and prevents repeated use of one bad copy.
1767 .sp
1768 Default value: \fB100\fR.
1769 .RE
1770
1771 .sp
1772 .ne 2
1773 .na
1774 \fBzfs_recover\fR (int)
1775 .ad
1776 .RS 12n
1777 Set to attempt to recover from fatal errors. This should only be used as a
1778 last resort, as it typically results in leaked space, or worse.
1779 .sp
1780 Use \fB1\fR for yes and \fB0\fR for no (default).
1781 .RE
1782
1783 .sp
1784 .ne 2
1785 .na
1786 \fBzfs_resilver_min_time_ms\fR (int)
1787 .ad
1788 .RS 12n
1789 Resilvers are processed by the sync thread. While resilvering it will spend
1790 at least this much time working on a resilver between txg flushes.
1791 .sp
1792 Default value: \fB3,000\fR.
1793 .RE
1794
1795 .sp
1796 .ne 2
1797 .na
1798 \fBzfs_scan_ignore_errors\fR (int)
1799 .ad
1800 .RS 12n
1801 If set to a nonzero value, remove the DTL (dirty time list) upon
1802 completion of a pool scan (scrub) even if there were unrepairable
1803 errors. It is intended to be used during pool repair or recovery to
1804 stop resilvering when the pool is next imported.
1805 .sp
1806 Default value: \fB0\fR.
1807 .RE
1808
1809 .sp
1810 .ne 2
1811 .na
1812 \fBzfs_scrub_min_time_ms\fR (int)
1813 .ad
1814 .RS 12n
1815 Scrubs are processed by the sync thread. While scrubbing it will spend
1816 at least this much time working on a scrub between txg flushes.
1817 .sp
1818 Default value: \fB1,000\fR.
1819 .RE
1820
1821 .sp
1822 .ne 2
1823 .na
1824 \fBzfs_scan_checkpoint_intval\fR (int)
1825 .ad
1826 .RS 12n
1827 To preserve progress across reboots the sequential scan algorithm periodically
1828 needs to stop metadata scanning and issue all the verification I/Os to disk.
1829 The frequency of this flushing is determined by the
1830 \fBzfs_scan_checkpoint_intval\fR tunable.
1831 .sp
1832 Default value: \fB7200\fR seconds (every 2 hours).
1833 .RE
1834
1835 .sp
1836 .ne 2
1837 .na
1838 \fBzfs_scan_fill_weight\fR (int)
1839 .ad
1840 .RS 12n
1841 This tunable affects how scrub and resilver I/O segments are ordered. A higher
1842 number indicates that we care more about how filled in a segment is, while a
1843 lower number indicates we care more about the size of the extent without
1844 considering the gaps within a segment. This value is only tunable upon module
1845 insertion. Changing the value afterwards will have no effect on scrub or
1846 resilver performance.
1847 .sp
1848 Default value: \fB3\fR.
1849 .RE
1850
1851 .sp
1852 .ne 2
1853 .na
1854 \fBzfs_scan_issue_strategy\fR (int)
1855 .ad
1856 .RS 12n
1857 Determines the order that data will be verified while scrubbing or resilvering.
1858 If set to \fB1\fR, data will be verified as sequentially as possible, given the
1859 amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This
1860 may improve scrub performance if the pool's data is very fragmented. If set to
1861 \fB2\fR, the largest mostly-contiguous chunk of found data will be verified
1862 first. By deferring scrubbing of small segments, we may later find adjacent data
1863 to coalesce and increase the segment size. If set to \fB0\fR, zfs will use
1864 strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a
1865 checkpoint.
1866 .sp
1867 Default value: \fB0\fR.
1868 .RE
1869
1870 .sp
1871 .ne 2
1872 .na
1873 \fBzfs_scan_legacy\fR (int)
1874 .ad
1875 .RS 12n
1876 A value of 0 indicates that scrubs and resilvers will gather metadata in
1877 memory before issuing sequential I/O. A value of 1 indicates that the legacy
1878 algorithm will be used where I/O is initiated as soon as it is discovered.
1879 Changing this value to 0 will not affect scrubs or resilvers that are already
1880 in progress.
1881 .sp
1882 Default value: \fB0\fR.
1883 .RE
1884
1885 .sp
1886 .ne 2
1887 .na
1888 \fBzfs_scan_max_ext_gap\fR (int)
1889 .ad
1890 .RS 12n
1891 Indicates the largest gap in bytes between scrub / resilver I/Os that will still
1892 be considered sequential for sorting purposes. Changing this value will not
1893 affect scrubs or resilvers that are already in progress.
1894 .sp
1895 Default value: \fB2097152 (2 MB)\fR.
1896 .RE
1897
1898 .sp
1899 .ne 2
1900 .na
1901 \fBzfs_scan_mem_lim_fact\fR (int)
1902 .ad
1903 .RS 12n
1904 Maximum fraction of RAM used for I/O sorting by sequential scan algorithm.
1905 This tunable determines the hard limit for I/O sorting memory usage.
1906 When the hard limit is reached we stop scanning metadata and start issuing
1907 data verification I/O. This is done until we get below the soft limit.
1908 .sp
1909 Default value: \fB20\fR which is 5% of RAM (1/20).
1910 .RE
1911
1912 .sp
1913 .ne 2
1914 .na
1915 \fBzfs_scan_mem_lim_soft_fact\fR (int)
1916 .ad
1917 .RS 12n
1918 The fraction of the hard limit used to determine the soft limit for I/O sorting
1919 by the sequential scan algorithm. When we cross this limit from below no action
1920 is taken. When we cross this limit from above it is because we are issuing
1921 verification I/O. In this case (unless the metadata scan is done) we stop
1922 issuing verification I/O and start scanning metadata again until we get to the
1923 hard limit.
1924 .sp
1925 Default value: \fB20\fR which is 5% of the hard limit (1/20).
1926 .RE
1927
1928 .sp
1929 .ne 2
1930 .na
1931 \fBzfs_scan_vdev_limit\fR (int)
1932 .ad
1933 .RS 12n
1934 Maximum amount of data that can be concurrently issued at once for scrubs and
1935 resilvers per leaf device, given in bytes.
1936 .sp
1937 Default value: \fB41943040\fR.
1938 .RE
1939
1940 .sp
1941 .ne 2
1942 .na
1943 \fBzfs_send_corrupt_data\fR (int)
1944 .ad
1945 .RS 12n
1946 Allow sending of corrupt data (ignore read/checksum errors when sending data)
1947 .sp
1948 Use \fB1\fR for yes and \fB0\fR for no (default).
1949 .RE
1950
1951 .sp
1952 .ne 2
1953 .na
1954 \fBzfs_send_queue_length\fR (int)
1955 .ad
1956 .RS 12n
1957 The maximum number of bytes allowed in the \fBzfs send\fR queue. This value
1958 must be at least twice the maximum block size in use.
1959 .sp
1960 Default value: \fB16,777,216\fR.
1961 .RE
1962
1963 .sp
1964 .ne 2
1965 .na
1966 \fBzfs_recv_queue_length\fR (int)
1967 .ad
1968 .RS 12n
1969 .sp
1970 The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value
1971 must be at least twice the maximum block size in use.
1972 .sp
1973 Default value: \fB16,777,216\fR.
1974 .RE
1975
1976 .sp
1977 .ne 2
1978 .na
1979 \fBzfs_sync_pass_deferred_free\fR (int)
1980 .ad
1981 .RS 12n
1982 Flushing of data to disk is done in passes. Defer frees starting in this pass
1983 .sp
1984 Default value: \fB2\fR.
1985 .RE
1986
1987 .sp
1988 .ne 2
1989 .na
1990 \fBzfs_sync_pass_dont_compress\fR (int)
1991 .ad
1992 .RS 12n
1993 Don't compress starting in this pass
1994 .sp
1995 Default value: \fB5\fR.
1996 .RE
1997
1998 .sp
1999 .ne 2
2000 .na
2001 \fBzfs_sync_pass_rewrite\fR (int)
2002 .ad
2003 .RS 12n
2004 Rewrite new block pointers starting in this pass
2005 .sp
2006 Default value: \fB2\fR.
2007 .RE
2008
2009 .sp
2010 .ne 2
2011 .na
2012 \fBzfs_sync_taskq_batch_pct\fR (int)
2013 .ad
2014 .RS 12n
2015 This controls the number of threads used by the dp_sync_taskq. The default
2016 value of 75% will create a maximum of one thread per cpu.
2017 .sp
2018 Default value: \fB75\fR%.
2019 .RE
2020
2021 .sp
2022 .ne 2
2023 .na
2024 \fBzfs_txg_history\fR (int)
2025 .ad
2026 .RS 12n
2027 Historical statistics for the last N txgs will be available in
2028 \fB/proc/spl/kstat/zfs/<pool>/txgs\fR
2029 .sp
2030 Default value: \fB0\fR.
2031 .RE
2032
2033 .sp
2034 .ne 2
2035 .na
2036 \fBzfs_txg_timeout\fR (int)
2037 .ad
2038 .RS 12n
2039 Flush dirty data to disk at least every N seconds (maximum txg duration)
2040 .sp
2041 Default value: \fB5\fR.
2042 .RE
2043
2044 .sp
2045 .ne 2
2046 .na
2047 \fBzfs_vdev_aggregation_limit\fR (int)
2048 .ad
2049 .RS 12n
2050 Max vdev I/O aggregation size
2051 .sp
2052 Default value: \fB131,072\fR.
2053 .RE
2054
2055 .sp
2056 .ne 2
2057 .na
2058 \fBzfs_vdev_cache_bshift\fR (int)
2059 .ad
2060 .RS 12n
Shift size to inflate reads to
2062 .sp
2063 Default value: \fB16\fR (effectively 65536).
2064 .RE
2065
2066 .sp
2067 .ne 2
2068 .na
2069 \fBzfs_vdev_cache_max\fR (int)
2070 .ad
2071 .RS 12n
2072 Inflate reads smaller than this value to meet the \fBzfs_vdev_cache_bshift\fR
2073 size (default 64k).
2074 .sp
2075 Default value: \fB16384\fR.
2076 .RE
2077
2078 .sp
2079 .ne 2
2080 .na
2081 \fBzfs_vdev_cache_size\fR (int)
2082 .ad
2083 .RS 12n
2084 Total size of the per-disk cache in bytes.
2085 .sp
2086 Currently this feature is disabled as it has been found to not be helpful
2087 for performance and in some cases harmful.
2088 .sp
2089 Default value: \fB0\fR.
2090 .RE
2091
2092 .sp
2093 .ne 2
2094 .na
2095 \fBzfs_vdev_mirror_rotating_inc\fR (int)
2096 .ad
2097 .RS 12n
A number by which the balancing algorithm increments the load calculation, for
the purpose of selecting the least busy mirror member, when an I/O immediately
follows its predecessor on rotational vdevs.
2102 .sp
2103 Default value: \fB0\fR.
2104 .RE
2105
2106 .sp
2107 .ne 2
2108 .na
2109 \fBzfs_vdev_mirror_rotating_seek_inc\fR (int)
2110 .ad
2111 .RS 12n
A number by which the balancing algorithm increments the load calculation, for
the purpose of selecting the least busy mirror member, when an I/O lacks
locality as defined by \fBzfs_vdev_mirror_rotating_seek_offset\fR. I/Os within
this range that do not immediately follow the previous I/O are incremented by
half of this value.
2117 .sp
2118 Default value: \fB5\fR.
2119 .RE
2120
2121 .sp
2122 .ne 2
2123 .na
2124 \fBzfs_vdev_mirror_rotating_seek_offset\fR (int)
2125 .ad
2126 .RS 12n
The maximum distance from the last queued I/O within which the balancing
algorithm considers an I/O to have locality.
2129 See the section "ZFS I/O SCHEDULER".
2130 .sp
2131 Default value: \fB1048576\fR.
2132 .RE
2133
2134 .sp
2135 .ne 2
2136 .na
2137 \fBzfs_vdev_mirror_non_rotating_inc\fR (int)
2138 .ad
2139 .RS 12n
2140 A number by which the balancing algorithm increments the load calculation for
2141 the purpose of selecting the least busy mirror member on non-rotational vdevs
2142 when I/Os do not immediately follow one another.
2143 .sp
2144 Default value: \fB0\fR.
2145 .RE
2146
2147 .sp
2148 .ne 2
2149 .na
2150 \fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int)
2151 .ad
2152 .RS 12n
A number by which the balancing algorithm increments the load calculation, for
the purpose of selecting the least busy mirror member, when an I/O lacks
locality as defined by \fBzfs_vdev_mirror_rotating_seek_offset\fR. I/Os within
this range that do not immediately follow the previous I/O are incremented by
half of this value.
2158 .sp
2159 Default value: \fB1\fR.
2160 .RE
2161
2162 .sp
2163 .ne 2
2164 .na
2165 \fBzfs_vdev_read_gap_limit\fR (int)
2166 .ad
2167 .RS 12n
2168 Aggregate read I/O operations if the gap on-disk between them is within this
2169 threshold.
2170 .sp
2171 Default value: \fB32,768\fR.
2172 .RE
2173
2174 .sp
2175 .ne 2
2176 .na
2177 \fBzfs_vdev_scheduler\fR (charp)
2178 .ad
2179 .RS 12n
Set the Linux I/O scheduler on whole disk vdevs to this scheduler. Valid
options are \fBnoop\fR, \fBcfq\fR, \fBbfq\fR and \fBdeadline\fR.
2182 .sp
2183 Default value: \fBnoop\fR.
2184 .RE
2185
2186 .sp
2187 .ne 2
2188 .na
2189 \fBzfs_vdev_write_gap_limit\fR (int)
2190 .ad
2191 .RS 12n
Aggregate write I/O operations if the gap on-disk between them is within this
threshold.
2193 .sp
2194 Default value: \fB4,096\fR.
2195 .RE
2196
2197 .sp
2198 .ne 2
2199 .na
2200 \fBzfs_vdev_raidz_impl\fR (string)
2201 .ad
2202 .RS 12n
2203 Parameter for selecting raidz parity implementation to use.
.sp
Options marked (always) below may be selected on module load as they are
supported on all systems.
The remaining options may only be set after the module is loaded, as they
are available only if the implementations are compiled in and supported
on the running system.
.sp
Once the module is loaded, the content of
/sys/module/zfs/parameters/zfs_vdev_raidz_impl will show available options
with the currently selected one enclosed in [].
Possible options are:
.nf
  fastest        - (always) implementation selected using built-in benchmark
  original       - (always) original raidz implementation
  scalar         - (always) scalar raidz implementation
  sse2           - implementation using SSE2 instruction set (64bit x86 only)
  ssse3          - implementation using SSSE3 instruction set (64bit x86 only)
  avx2           - implementation using AVX2 instruction set (64bit x86 only)
  avx512f        - implementation using AVX512F instruction set (64bit x86 only)
  avx512bw       - implementation using AVX512F & AVX512BW instruction sets
                   (64bit x86 only)
  aarch64_neon   - implementation using NEON (Aarch64/64 bit ARMv8 only)
  aarch64_neonx2 - implementation using NEON with more unrolling
                   (Aarch64/64 bit ARMv8 only)
.fi
2225 .sp
2226 Default value: \fBfastest\fR.
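.sp
As an illustration only (a sketch in Python, not part of ZFS), the currently
selected implementation can be recovered from the sysfs file described above by
extracting the bracketed entry:
.sp
.nf
def current_raidz_impl(path="/sys/module/zfs/parameters/zfs_vdev_raidz_impl"):
    # The file lists all available options; the active one is enclosed in [].
    with open(path) as f:
        options = f.read().split()
    for opt in options:
        if opt.startswith("[") and opt.endswith("]"):
            return opt.strip("[]")
    return None
.fi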
2227 .RE
2228
2229 .sp
2230 .ne 2
2231 .na
2232 \fBzfs_zevent_cols\fR (int)
2233 .ad
2234 .RS 12n
2235 When zevents are logged to the console use this as the word wrap width.
2236 .sp
2237 Default value: \fB80\fR.
2238 .RE
2239
2240 .sp
2241 .ne 2
2242 .na
2243 \fBzfs_zevent_console\fR (int)
2244 .ad
2245 .RS 12n
2246 Log events to the console
2247 .sp
2248 Use \fB1\fR for yes and \fB0\fR for no (default).
2249 .RE
2250
2251 .sp
2252 .ne 2
2253 .na
2254 \fBzfs_zevent_len_max\fR (int)
2255 .ad
2256 .RS 12n
2257 Max event queue length. A value of 0 will result in a calculated value which
2258 increases with the number of CPUs in the system (minimum 64 events). Events
2259 in the queue can be viewed with the \fBzpool events\fR command.
2260 .sp
2261 Default value: \fB0\fR.
2262 .RE
2263
2264 .sp
2265 .ne 2
2266 .na
2267 \fBzfs_zil_clean_taskq_maxalloc\fR (int)
2268 .ad
2269 .RS 12n
2270 The maximum number of taskq entries that are allowed to be cached. When this
2271 limit is exceeded transaction records (itxs) will be cleaned synchronously.
2272 .sp
2273 Default value: \fB1048576\fR.
2274 .RE
2275
2276 .sp
2277 .ne 2
2278 .na
2279 \fBzfs_zil_clean_taskq_minalloc\fR (int)
2280 .ad
2281 .RS 12n
2282 The number of taskq entries that are pre-populated when the taskq is first
2283 created and are immediately available for use.
2284 .sp
2285 Default value: \fB1024\fR.
2286 .RE
2287
2288 .sp
2289 .ne 2
2290 .na
2291 \fBzfs_zil_clean_taskq_nthr_pct\fR (int)
2292 .ad
2293 .RS 12n
2294 This controls the number of threads used by the dp_zil_clean_taskq. The default
2295 value of 100% will create a maximum of one thread per cpu.
2296 .sp
2297 Default value: \fB100\fR%.
2298 .RE
2299
2300 .sp
2301 .ne 2
2302 .na
2303 \fBzil_replay_disable\fR (int)
2304 .ad
2305 .RS 12n
Disable intent logging replay. Replay can be disabled in order to recover from
a corrupted ZIL.
2308 .sp
2309 Use \fB1\fR for yes and \fB0\fR for no (default).
2310 .RE
2311
2312 .sp
2313 .ne 2
2314 .na
2315 \fBzil_slog_bulk\fR (ulong)
2316 .ad
2317 .RS 12n
Limit SLOG write size per commit executed with synchronous priority.
Any writes above that will be executed with lower (asynchronous) priority
to limit potential SLOG device abuse by a single active ZIL writer.
2321 .sp
2322 Default value: \fB786,432\fR.
2323 .RE
2324
2325 .sp
2326 .ne 2
2327 .na
2328 \fBzio_delay_max\fR (int)
2329 .ad
2330 .RS 12n
2331 A zevent will be logged if a ZIO operation takes more than N milliseconds to
2332 complete. Note that this is only a logging facility, not a timeout on
2333 operations.
2334 .sp
2335 Default value: \fB30,000\fR.
2336 .RE
2337
2338 .sp
2339 .ne 2
2340 .na
2341 \fBzio_dva_throttle_enabled\fR (int)
2342 .ad
2343 .RS 12n
2344 Throttle block allocations in the ZIO pipeline. This allows for
2345 dynamic allocation distribution when devices are imbalanced.
2346 When enabled, the maximum number of pending allocations per top-level vdev
2347 is limited by \fBzfs_vdev_queue_depth_pct\fR.
2348 .sp
2349 Default value: \fB1\fR.
2350 .RE
2351
2352 .sp
2353 .ne 2
2354 .na
2355 \fBzio_requeue_io_start_cut_in_line\fR (int)
2356 .ad
2357 .RS 12n
2358 Prioritize requeued I/O
2359 .sp
2360 Default value: \fB0\fR.
2361 .RE
2362
2363 .sp
2364 .ne 2
2365 .na
2366 \fBzio_taskq_batch_pct\fR (uint)
2367 .ad
2368 .RS 12n
Percentage of online CPUs (or CPU cores, etc.) which will run a worker thread
for I/O. These workers are responsible for I/O work such as compression and
checksum calculations. A fractional number of CPUs will be rounded down.
2372 .sp
2373 The default value of 75 was chosen to avoid using all CPUs which can result in
2374 latency issues and inconsistent application performance, especially when high
2375 compression is enabled.
2376 .sp
2377 Default value: \fB75\fR.
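.sp
A minimal sketch (in Python, not the in-kernel code) of how the worker thread
count follows from this percentage, assuming the result is rounded down and at
least one worker is always created:
.sp
.nf
import os

def zio_taskq_workers(zio_taskq_batch_pct=75):
    # Percentage of online CPUs running I/O worker threads; any
    # fractional CPU is rounded down.
    ncpus = os.cpu_count() or 1
    return max(1, (ncpus * zio_taskq_batch_pct) // 100)

# Example: 8 CPUs at 75% -> 6 worker threads.
.fi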
2378 .RE
2379
2380 .sp
2381 .ne 2
2382 .na
2383 \fBzvol_inhibit_dev\fR (uint)
2384 .ad
2385 .RS 12n
2386 Do not create zvol device nodes. This may slightly improve startup time on
2387 systems with a very large number of zvols.
2388 .sp
2389 Use \fB1\fR for yes and \fB0\fR for no (default).
2390 .RE
2391
2392 .sp
2393 .ne 2
2394 .na
2395 \fBzvol_major\fR (uint)
2396 .ad
2397 .RS 12n
2398 Major number for zvol block devices
2399 .sp
2400 Default value: \fB230\fR.
2401 .RE
2402
2403 .sp
2404 .ne 2
2405 .na
2406 \fBzvol_max_discard_blocks\fR (ulong)
2407 .ad
2408 .RS 12n
2409 Discard (aka TRIM) operations done on zvols will be done in batches of this
2410 many blocks, where block size is determined by the \fBvolblocksize\fR property
2411 of a zvol.
2412 .sp
2413 Default value: \fB16,384\fR.
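.sp
For example, with an 8 KB \fBvolblocksize\fR the default of 16,384 blocks
corresponds to discard batches of 128 MB.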
2414 .RE
2415
2416 .sp
2417 .ne 2
2418 .na
2419 \fBzvol_prefetch_bytes\fR (uint)
2420 .ad
2421 .RS 12n
When adding a zvol to the system, prefetch \fBzvol_prefetch_bytes\fR
from the start and end of the volume. Prefetching these regions
of the volume is desirable because they are likely to be accessed
immediately by \fBblkid(8)\fR or by the kernel scanning for a partition
table.
2427 .sp
2428 Default value: \fB131,072\fR.
2429 .RE
2430
2431 .sp
2432 .ne 2
2433 .na
2434 \fBzvol_request_sync\fR (uint)
2435 .ad
2436 .RS 12n
When processing I/O requests for a zvol, submit them synchronously. This
effectively limits the queue depth to 1 for each I/O submitter. When set
to 0, requests are handled asynchronously by a thread pool. The number of
requests which can be handled concurrently is controlled by \fBzvol_threads\fR.
2441 .sp
2442 Default value: \fB0\fR.
2443 .RE
2444
2445 .sp
2446 .ne 2
2447 .na
2448 \fBzvol_threads\fR (uint)
2449 .ad
2450 .RS 12n
2451 Max number of threads which can handle zvol I/O requests concurrently.
2452 .sp
2453 Default value: \fB32\fR.
2454 .RE
2455
2456 .sp
2457 .ne 2
2458 .na
2459 \fBzvol_volmode\fR (uint)
2460 .ad
2461 .RS 12n
Defines the behaviour of zvol block devices when \fBvolmode\fR is set to
\fBdefault\fR. Valid values are \fB1\fR (full), \fB2\fR (dev) and \fB3\fR
(none).
2464 .sp
2465 Default value: \fB1\fR.
2466 .RE
2467
2468 .sp
2469 .ne 2
2470 .na
2471 \fBzfs_qat_disable\fR (int)
2472 .ad
2473 .RS 12n
This tunable disables qat hardware acceleration for gzip compression and
AES-GCM encryption. It is available only if qat acceleration is compiled in
and the qat driver is present.
2477 .sp
2478 Use \fB1\fR for yes and \fB0\fR for no (default).
2479 .RE
2480
2481 .SH ZFS I/O SCHEDULER
2482 ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
2483 The I/O scheduler determines when and in what order those operations are
2484 issued. The I/O scheduler divides operations into five I/O classes
2485 prioritized in the following order: sync read, sync write, async read,
2486 async write, and scrub/resilver. Each queue defines the minimum and
2487 maximum number of concurrent operations that may be issued to the
2488 device. In addition, the device has an aggregate maximum,
2489 \fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums
2490 must not exceed the aggregate maximum. If the sum of the per-queue
2491 maximums exceeds the aggregate maximum, then the number of active I/Os
2492 may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will
2493 be issued regardless of whether all per-queue minimums have been met.
2494 .sp
2495 For many physical devices, throughput increases with the number of
2496 concurrent operations, but latency typically suffers. Further, physical
2497 devices typically have a limit at which more concurrent operations have no
2498 effect on throughput or can actually cause it to decrease.
2499 .sp
2500 The scheduler selects the next operation to issue by first looking for an
2501 I/O class whose minimum has not been satisfied. Once all are satisfied and
2502 the aggregate maximum has not been hit, the scheduler looks for classes
2503 whose maximum has not been satisfied. Iteration through the I/O classes is
2504 done in the order specified above. No further operations are issued if the
2505 aggregate maximum number of concurrent operations has been hit or if there
2506 are no operations queued for an I/O class that has not hit its maximum.
2507 Every time an I/O is queued or an operation completes, the I/O scheduler
2508 looks for new operations to issue.
2509 .sp
In general, smaller values of max_active will lead to lower latency of
synchronous operations. Larger values of max_active may lead to higher overall
throughput, depending on the underlying storage.
2513 .sp
The ratio of the queues' max_active values determines the balance of
performance between reads, writes, and scrubs. For example, increasing
\fBzfs_vdev_scrub_max_active\fR will cause a scrub or resilver to complete
more quickly, but will also cause reads and writes to have higher latency and
lower throughput.
2518 .sp
2519 All I/O classes have a fixed maximum number of outstanding operations
2520 except for the async write class. Asynchronous writes represent the data
2521 that is committed to stable storage during the syncing stage for
2522 transaction groups. Transaction groups enter the syncing state
2523 periodically so the number of queued async writes will quickly burst up
2524 and then bleed down to zero. Rather than servicing them as quickly as
2525 possible, the I/O scheduler changes the maximum number of active async
2526 write I/Os according to the amount of dirty data in the pool. Since
2527 both throughput and latency typically increase with the number of
2528 concurrent operations issued to physical devices, reducing the
2529 burstiness in the number of concurrent operations also stabilizes the
2530 response time of operations from other -- and in particular synchronous
2531 -- queues. In broad strokes, the I/O scheduler will issue more
2532 concurrent operations from the async write queue as there's more dirty
2533 data in the pool.
2534 .sp
2535 Async Writes
2536 .sp
2537 The number of concurrent operations issued for the async write I/O class
2538 follows a piece-wise linear function defined by a few adjustable points.
2539 .nf
2540
2541 | o---------| <-- zfs_vdev_async_write_max_active
2542 ^ | /^ |
2543 | | / | |
2544 active | / | |
2545 I/O | / | |
2546 count | / | |
2547 | / | |
2548 |-------o | | <-- zfs_vdev_async_write_min_active
2549 0|_______^______|_________|
2550 0% | | 100% of zfs_dirty_data_max
2551 | |
2552 | `-- zfs_vdev_async_write_active_max_dirty_percent
2553 `--------- zfs_vdev_async_write_active_min_dirty_percent
2554
2555 .fi
2556 Until the amount of dirty data exceeds a minimum percentage of the dirty
2557 data allowed in the pool, the I/O scheduler will limit the number of
2558 concurrent operations to the minimum. As that threshold is crossed, the
2559 number of concurrent operations issued increases linearly to the maximum at
2560 the specified maximum percentage of the dirty data allowed in the pool.
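.sp
A minimal sketch (in Python, not the ZFS implementation) of the piece-wise
linear function described above; the default values used for the four tunables
from the diagram are illustrative only:
.sp
.nf
def async_write_active(dirty_bytes,
                       zfs_dirty_data_max,
                       min_active=1,       # zfs_vdev_async_write_min_active
                       max_active=10,      # zfs_vdev_async_write_max_active
                       min_dirty_pct=30,   # ..._active_min_dirty_percent
                       max_dirty_pct=60):  # ..._active_max_dirty_percent
    pct = 100.0 * dirty_bytes / zfs_dirty_data_max
    if pct <= min_dirty_pct:
        return min_active
    if pct >= max_dirty_pct:
        return max_active
    # Linear ramp between the two break points.
    frac = (pct - min_dirty_pct) / (max_dirty_pct - min_dirty_pct)
    return round(min_active + frac * (max_active - min_active))
.fi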
2561 .sp
2562 Ideally, the amount of dirty data on a busy pool will stay in the sloped
2563 part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR
2564 and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the
2565 maximum percentage, this indicates that the rate of incoming data is
2566 greater than the rate that the backend storage can handle. In this case, we
2567 must further throttle incoming writes, as described in the next section.
2568
2569 .SH ZFS TRANSACTION DELAY
2570 We delay transactions when we've determined that the backend storage
2571 isn't able to accommodate the rate of incoming writes.
2572 .sp
2573 If there is already a transaction waiting, we delay relative to when
2574 that transaction will finish waiting. This way the calculated delay time
2575 is independent of the number of threads concurrently executing
2576 transactions.
2577 .sp
2578 If we are the only waiter, wait relative to when the transaction
2579 started, rather than the current time. This credits the transaction for
2580 "time already served", e.g. reading indirect blocks.
2581 .sp
2582 The minimum time for a transaction to take is calculated as:
2583 .nf
2584 min_time = zfs_delay_scale * (dirty - min) / (max - dirty)
2585 min_time is then capped at 100 milliseconds.
2586 .fi
2587 .sp
2588 The delay has two degrees of freedom that can be adjusted via tunables. The
2589 percentage of dirty data at which we start to delay is defined by
2590 \fBzfs_delay_min_dirty_percent\fR. This should typically be at or above
2591 \fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to
2592 delay after writing at full speed has failed to keep up with the incoming write
2593 rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking,
2594 this variable determines the amount of delay at the midpoint of the curve.
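.sp
For illustration only, a sketch (in Python, not the ZFS implementation) of the
capped delay formula above; the defaults shown for \fBzfs_delay_scale\fR and
\fBzfs_delay_min_dirty_percent\fR are illustrative:
.sp
.nf
def txg_delay_ns(dirty_bytes,
                 zfs_dirty_data_max,
                 zfs_delay_scale=500000,           # nanoseconds
                 zfs_delay_min_dirty_percent=60):
    # No delay until the dirty-data percentage crosses the threshold.
    min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent // 100
    if dirty_bytes <= min_bytes:
        return 0
    # Cap as dirty data approaches the maximum (also avoids dividing by zero).
    if dirty_bytes >= zfs_dirty_data_max:
        return 100_000_000
    # min_time = zfs_delay_scale * (dirty - min) / (max - dirty),
    # capped at 100 milliseconds.
    delay = zfs_delay_scale * (dirty_bytes - min_bytes) / (
        zfs_dirty_data_max - dirty_bytes)
    return min(delay, 100_000_000)
.fi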
2595 .sp
2596 .nf
2597 delay
2598 10ms +-------------------------------------------------------------*+
2599 | *|
2600 9ms + *+
2601 | *|
2602 8ms + *+
2603 | * |
2604 7ms + * +
2605 | * |
2606 6ms + * +
2607 | * |
2608 5ms + * +
2609 | * |
2610 4ms + * +
2611 | * |
2612 3ms + * +
2613 | * |
2614 2ms + (midpoint) * +
2615 | | ** |
2616 1ms + v *** +
2617 | zfs_delay_scale ----------> ******** |
2618 0 +-------------------------------------*********----------------+
2619 0% <- zfs_dirty_data_max -> 100%
2620 .fi
2621 .sp
2622 Note that since the delay is added to the outstanding time remaining on the
2623 most recent transaction, the delay is effectively the inverse of IOPS.
2624 Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
2625 was chosen such that small changes in the amount of accumulated dirty data
2626 in the first 3/4 of the curve yield relatively small differences in the
2627 amount of delay.
2628 .sp
2629 The effects can be easier to understand when the amount of delay is
2630 represented on a log scale:
2631 .sp
2632 .nf
2633 delay
2634 100ms +-------------------------------------------------------------++
2635 + +
2636 | |
2637 + *+
2638 10ms + *+
2639 + ** +
2640 | (midpoint) ** |
2641 + | ** +
2642 1ms + v **** +
2643 + zfs_delay_scale ----------> ***** +
2644 | **** |
2645 + **** +
2646 100us + ** +
2647 + * +
2648 | * |
2649 + * +
2650 10us + * +
2651 + +
2652 | |
2653 + +
2654 +--------------------------------------------------------------+
2655 0% <- zfs_dirty_data_max -> 100%
2656 .fi
2657 .sp
2658 Note here that only as the amount of dirty data approaches its limit does
2659 the delay start to increase rapidly. The goal of a properly tuned system
2660 should be to keep the amount of dirty data out of that range by first
2661 ensuring that the appropriate limits are set for the I/O scheduler to reach
2662 optimal throughput on the backend storage, and then by changing the value
2663 of \fBzfs_delay_scale\fR to increase the steepness of the curve.