Make zio_taskq_batch_pct user configurable

[mirror_zfs.git] / man / man5 / zfs-module-parameters.5
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5

index a54e31f8bc22a5f379200d9e5a042f1881db9acc..6be382eacafe9f8d7bf67719ff54612f9e133de0 100644 (file)
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -137,10 +137,47 @@ Default value: \fB8,388,608\fR.
  .sp
  .ne 2
  .na
-\fBmetaslab_debug\fR (int)
+\fBmetaslab_aliquot\fR (ulong)
  .ad
  .RS 12n
-Keep space maps in core to verify frees
+Metaslab granularity, in bytes. This is roughly similar to what would be
+referred to as the "stripe size" in traditional RAID arrays. In normal
+operation, ZFS will try to write this amount of data to a top-level vdev
+before moving on to the next one.
+.sp
+Default value: \fB524,288\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_bias_enabled\fR (int)
+.ad
+.RS 12n
+Enable metaslab group biasing based on its vdev's over- or under-utilization
+relative to the pool.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_debug_load\fR (int)
+.ad
+.RS 12n
+Load all metaslabs during pool import.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_debug_unload\fR (int)
+.ad
+.RS 12n
+Prevent metaslabs from being unloaded.
  .sp
  Use \fB1\fR for yes and \fB0\fR for no (default).
  .RE
@@ -148,210 +185,777 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
  .sp
  .ne 2
  .na
-\fBspa_config_path\fR (charp)
+\fBmetaslab_fragmentation_factor_enabled\fR (int)
+.ad
+.RS 12n
+Enable use of the fragmentation metric in computing metaslab weights.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslabs_per_vdev\fR (int)
+.ad
+.RS 12n
+When a vdev is added, it will be divided into approximately (but no more than) this number of metaslabs.
+.sp
+Default value: \fB200\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_preload_enabled\fR (int)
+.ad
+.RS 12n
+Enable metaslab group preloading.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_lba_weighting_enabled\fR (int)
+.ad
+.RS 12n
+Give more weight to metaslabs with lower LBAs, assuming they have
+greater bandwidth as is typically the case on a modern constant
+angular velocity disk drive.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBspa_config_path\fR (charp)
+.ad
+.RS 12n
+SPA config file
+.sp
+Default value: \fB/etc/zfs/zpool.cache\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBspa_asize_inflation\fR (int)
+.ad
+.RS 12n
+Multiplication factor used to estimate actual disk consumption from the
+size of data being written. The default value is a worst case estimate,
+but lower values may be valid for a given pool depending on its
+configuration.  Pool administrators who understand the factors involved
+may wish to specify a more realistic inflation factor, particularly if
+they operate close to quota or capacity limits.
+.sp
+Default value: 24
+.RE
+
+.sp
+.ne 2
+.na
+\fBspa_load_verify_data\fR (int)
+.ad
+.RS 12n
+Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR)
+import.  Use 0 to disable and 1 to enable.
+
+An extreme rewind import normally performs a full traversal of all
+blocks in the pool for verification.  If this parameter is set to 0,
+the traversal skips non-metadata blocks.  It can be toggled once the
+import has started to stop or start the traversal of non-metadata blocks.
+.sp
+Default value: 1
+.RE
+
+.sp
+.ne 2
+.na
+\fBspa_load_verify_metadata\fR (int)
+.ad
+.RS 12n
+Whether to traverse blocks during an "extreme rewind" (\fB-X\fR)
+pool import.  Use 0 to disable and 1 to enable.
+
+An extreme rewind import normally performs a full traversal of all
+blocks in the pool for verification.  If this parameter is set to 0,
+the traversal is not performed.  It can be toggled once the import has
+started to stop or start the traversal.
+.sp
+Default value: 1
+.RE
+
+.sp
+.ne 2
+.na
+\fBspa_load_verify_maxinflight\fR (int)
+.ad
+.RS 12n
+Maximum concurrent I/Os during the traversal performed during an "extreme
+rewind" (\fB-X\fR) pool import.
+.sp
+Default value: 10000
+.RE
+
+.sp
+.ne 2
+.na
+\fBspa_slop_shift\fR (int)
+.ad
+.RS 12n
+Normally, we don't allow the last 3.125% (1/(2^spa_slop_shift)) of space
+in the pool to be consumed.  This ensures that we don't run the pool
+completely out of space, due to unaccounted changes (e.g. to the MOS).
+It also limits the worst-case time to allocate space.  If we have
+less than this amount of free space, most ZPL operations (e.g. write,
+create) will return ENOSPC.
+.sp
+Default value: 5
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_array_rd_sz\fR (ulong)
+.ad
+.RS 12n
+If prefetching is enabled, disable prefetching for reads larger than this size.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_block_cap\fR (uint)
+.ad
+.RS 12n
+Max number of blocks to prefetch at a time
+.sp
+Default value: \fB256\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_max_streams\fR (uint)
+.ad
+.RS 12n
+Max number of streams per zfetch (prefetch streams per file).
+.sp
+Default value: \fB8\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_min_sec_reap\fR (uint)
+.ad
+.RS 12n
+Min time before an active prefetch stream can be reclaimed
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_average_blocksize\fR (int)
+.ad
+.RS 12n
+The ARC's buffer hash table is sized based on the assumption of an average
+block size of \fBzfs_arc_average_blocksize\fR (default 8K).  This works out
+to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers.
+For configurations with a known larger average block size this value can be
+increased to reduce the memory footprint.
+
+.sp
+Default value: \fB8192\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_evict_batch_limit\fR (int)
+.ad
+.RS 12n
+Number of ARC headers to evict per sub-list before proceeding to another sub-list.
+This batch-style operation prevents entire sub-lists from being evicted at once
+but comes at a cost of additional unlocking and locking.
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_grow_retry\fR (int)
+.ad
+.RS 12n
+Seconds before growing arc size
+.sp
+Default value: \fB5\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_lotsfree_percent\fR (int)
+.ad
+.RS 12n
+Throttle I/O when free system memory drops below this percentage of total
+system memory.  Setting this value to 0 will disable the throttle.
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_max\fR (ulong)
+.ad
+.RS 12n
+Max arc size
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_limit\fR (ulong)
+.ad
+.RS 12n
+The maximum allowed size in bytes that meta data buffers are allowed to
+consume in the ARC.  When this limit is reached meta data buffers will
+be reclaimed even if the overall arc_c_max has not been reached.  This
+value defaults to 0 which indicates that 3/4 of the ARC may be used
+for meta data.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_min\fR (ulong)
+.ad
+.RS 12n
+The minimum allowed size in bytes that meta data buffers may consume in
+the ARC.  This value defaults to 0 which disables a floor on the amount
+of the ARC devoted to meta data.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_prune\fR (int)
+.ad
+.RS 12n
+The number of dentries and inodes to be scanned looking for entries
+which can be dropped.  This may be required when the ARC reaches the
+\fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers
+in the ARC.  Increasing this value will cause the dentry and inode caches
+to be pruned more aggressively.  Setting this value to 0 will disable
+pruning the inode and dentry caches.
+.sp
+Default value: \fB10,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_adjust_restarts\fR (ulong)
+.ad
+.RS 12n
+The number of restart passes to make while scanning the ARC attempting
+to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
+This value should not need to be tuned but is available to facilitate
+performance analysis.
+.sp
+Default value: \fB4096\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_min\fR (ulong)
+.ad
+.RS 12n
+Min arc size
+.sp
+Default value: \fB100\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_min_prefetch_lifespan\fR (int)
+.ad
+.RS 12n
+Min life of prefetch block
+.sp
+Default value: \fB100\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_num_sublists_per_state\fR (int)
+.ad
+.RS 12n
+To allow more fine-grained locking, each ARC state contains a series
+of lists for both data and meta data objects.  Locking is performed at
+the level of these "sub-lists".  This parameter controls the number of
+sub-lists per ARC state.
+.sp
+Default value: 1 or the number of online CPUs, whichever is greater
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_overflow_shift\fR (int)
+.ad
+.RS 12n
+The ARC size is considered to be overflowing if it exceeds the current
+ARC target size (arc_c) by a threshold determined by this parameter.
+The threshold is calculated as a fraction of arc_c using the formula
+"arc_c >> \fBzfs_arc_overflow_shift\fR".
+
+The default value of 8 causes the ARC to be considered to be overflowing
+if it exceeds the target size by 1/256th (0.3%) of the target size.
+
+When the ARC is overflowing, new buffer allocations are stalled until
+the reclaim thread catches up and the overflow condition no longer exists.
+.sp
+Default value: \fB8\fR.
+.RE
+
+.sp
+.ne 2
+.na
+
+\fBzfs_arc_p_min_shift\fR (int)
+.ad
+.RS 12n
+arc_c shift to calc min/max arc_p
+.sp
+Default value: \fB4\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_p_aggressive_disable\fR (int)
+.ad
+.RS 12n
+Disable aggressive arc_p growth
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_p_dampener_disable\fR (int)
+.ad
+.RS 12n
+Disable arc_p adapt dampener
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_shrink_shift\fR (int)
+.ad
+.RS 12n
+log2(fraction of arc to reclaim)
+.sp
+Default value: \fB5\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_sys_free\fR (ulong)
+.ad
+.RS 12n
+The target number of bytes the ARC should leave as free memory on the system.
+Defaults to the larger of 1/64 of physical memory or 512K.  Setting this
+option to a non-zero value will override the default.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_autoimport_disable\fR (int)
+.ad
+.RS 12n
+Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR).
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dbgmsg_enable\fR (int)
+.ad
+.RS 12n
+Internally ZFS keeps a small log to facilitate debugging.  By default the log
+is disabled, to enable it set this option to 1.  The contents of the log can
+be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file.  Writing 0 to
+this proc file clears the log.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dbgmsg_maxsize\fR (int)
+.ad
+.RS 12n
+The maximum size in bytes of the internal ZFS debug log.
+.sp
+Default value: \fB4M\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dbuf_state_index\fR (int)
+.ad
+.RS 12n
+Calculate arc header index
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_deadman_enabled\fR (int)
+.ad
+.RS 12n
+Enable deadman timer
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR to disable.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_deadman_synctime_ms\fR (ulong)
+.ad
+.RS 12n
+Expiration time in milliseconds. This value has two meanings. First it is
+used to determine when the spa_deadman() logic should fire. By default the
+spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+Secondly, the value determines if an I/O is considered "hung". Any I/O that
+has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+in a zevent being logged.
+.sp
+Default value: \fB1,000,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dedup_prefetch\fR (int)
+.ad
+.RS 12n
+Enable prefetching dedup-ed blks
+.sp
+Use \fB1\fR for yes and \fB0\fR to disable (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_delay_min_dirty_percent\fR (int)
  .ad
  .RS 12n
-SPA config file
+Start to delay each transaction once there is this amount of dirty data,
+expressed as a percentage of \fBzfs_dirty_data_max\fR.
+This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+See the section "ZFS TRANSACTION DELAY".
  .sp
-Default value: \fB/etc/zfs/zpool.cache\fR.
+Default value: \fB60\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfetch_array_rd_sz\fR (ulong)
+\fBzfs_delay_scale\fR (int)
  .ad
  .RS 12n
-Number of bytes in a array_read
+This controls how quickly the transaction delay approaches infinity.
+Larger values cause longer delays for a given amount of dirty data.
  .sp
-Default value: \fB1,048,576\fR.
+For the smoothest delay, this value should be about 1 billion divided
+by the maximum number of operations per second.  This will smoothly
+handle between 10x and 1/10th this number.
+.sp
+See the section "ZFS TRANSACTION DELAY".
+.sp
+Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64.
+.sp
+Default value: \fB500,000\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfetch_block_cap\fR (uint)
+\fBzfs_dirty_data_max\fR (int)
  .ad
  .RS 12n
-Max number of blocks to fetch at a time
+Determines the dirty space limit in bytes.  Once this limit is exceeded, new
+writes are halted until space frees up. This parameter takes precedence
+over \fBzfs_dirty_data_max_percent\fR.
+See the section "ZFS TRANSACTION DELAY".
  .sp
-Default value: \fB256\fR.
+Default value: 10 percent of all memory, capped at \fBzfs_dirty_data_max_max\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfetch_max_streams\fR (uint)
+\fBzfs_dirty_data_max_max\fR (int)
  .ad
  .RS 12n
-Max number of streams per zfetch
+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes.
+This limit is only enforced at module load time, and will be ignored if
+\fBzfs_dirty_data_max\fR is later changed.  This parameter takes
+precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section
+"ZFS TRANSACTION DELAY".
  .sp
-Default value: \fB8\fR.
+Default value: 25% of physical RAM.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfetch_min_sec_reap\fR (uint)
+\fBzfs_dirty_data_max_max_percent\fR (int)
  .ad
  .RS 12n
-Min time before stream reclaim
+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a
+percentage of physical RAM.  This limit is only enforced at module load
+time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed.
+The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this
+one. See the section "ZFS TRANSACTION DELAY".
  .sp
-Default value: \fB2\fR.
+Default value: 25
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_grow_retry\fR (int)
+\fBzfs_dirty_data_max_percent\fR (int)
  .ad
  .RS 12n
-Seconds before growing arc size
+Determines the dirty space limit, expressed as a percentage of all
+memory.  Once this limit is exceeded, new writes are halted until space frees
+up.  The parameter \fBzfs_dirty_data_max\fR takes precedence over this
+one.  See the section "ZFS TRANSACTION DELAY".
  .sp
-Default value: \fB5\fR.
+Default value: 10%, subject to \fBzfs_dirty_data_max_max\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_max\fR (ulong)
+\fBzfs_dirty_data_sync\fR (int)
  .ad
  .RS 12n
-Max arc size
+Start syncing out a transaction group if there is at least this much dirty data.
  .sp
-Default value: \fB0\fR.
+Default value: \fB67,108,864\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_memory_throttle_disable\fR (int)
+\fBzfs_free_max_blocks\fR (ulong)
  .ad
  .RS 12n
-Disable memory throttle
+Maximum number of blocks freed in a single txg.
  .sp
-Use \fB1\fR for yes (default) and \fB0\fR to disable.
+Default value: \fB100,000\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_meta_limit\fR (ulong)
+\fBzfs_vdev_async_read_max_active\fR (int)
  .ad
  .RS 12n
-Meta limit for arc size
+Maximum asynchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB0\fR.
+Default value: \fB3\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_meta_prune\fR (int)
+\fBzfs_vdev_async_read_min_active\fR (int)
  .ad
  .RS 12n
-Bytes of meta data to prune
+Minimum asynchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB1,048,576\fR.
+Default value: \fB1\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_min\fR (ulong)
+\fBzfs_vdev_async_write_active_max_dirty_percent\fR (int)
  .ad
  .RS 12n
-Min arc size
+When the pool has more than
+\fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use
+\fBzfs_vdev_async_write_max_active\fR to limit active async writes.  If
+the dirty data is between min and max, the active I/O limit is linearly
+interpolated. See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB100\fR.
+Default value: \fB60\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_min_prefetch_lifespan\fR (int)
+\fBzfs_vdev_async_write_active_min_dirty_percent\fR (int)
  .ad
  .RS 12n
-Min life of prefetch block
+When the pool has less than
+\fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use
+\fBzfs_vdev_async_write_min_active\fR to limit active async writes.  If
+the dirty data is between min and max, the active I/O limit is linearly
+interpolated. See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB100\fR.
+Default value: \fB30\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_p_min_shift\fR (int)
+\fBzfs_vdev_async_write_max_active\fR (int)
  .ad
  .RS 12n
-arc_c shift to calc min/max arc_p
+Maximum asynchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB4\fR.
+Default value: \fB10\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_arc_shrink_shift\fR (int)
+\fBzfs_vdev_async_write_min_active\fR (int)
  .ad
  .RS 12n
-log2(fraction of arc to reclaim)
+Minimum asynchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB5\fR.
+Default value: \fB1\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_autoimport_disable\fR (int)
+\fBzfs_vdev_max_active\fR (int)
  .ad
  .RS 12n
-Disable pool import at module load
+The maximum number of I/Os active to each device.  Ideally, this will be >=
+the sum of each queue's max_active.  It must be at least the sum of each
+queue's min_active.  See the section "ZFS I/O SCHEDULER".
  .sp
-Use \fB1\fR for yes and \fB0\fR for no (default).
+Default value: \fB1,000\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_dbuf_state_index\fR (int)
+\fBzfs_vdev_scrub_max_active\fR (int)
  .ad
  .RS 12n
-Calculate arc header index
+Maximum scrub I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB0\fR.
+Default value: \fB2\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_deadman_enabled\fR (int)
+\fBzfs_vdev_scrub_min_active\fR (int)
  .ad
  .RS 12n
-Enable deadman timer
+Minimum scrub I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Use \fB1\fR for yes (default) and \fB0\fR to disable.
+Default value: \fB1\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_deadman_synctime\fR (ulong)
+\fBzfs_vdev_sync_read_max_active\fR (int)
  .ad
  .RS 12n
-Expire in units of zfs_txg_synctime_ms
+Maximum synchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Default value: \fB1,000\fR.
+Default value: \fB10\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_dedup_prefetch\fR (int)
+\fBzfs_vdev_sync_read_min_active\fR (int)
  .ad
  .RS 12n
-Enable prefetching dedup-ed blks
+Minimum synchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
  .sp
-Use \fB1\fR for yes (default) and \fB0\fR to disable.
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_sync_write_max_active\fR (int)
+.ad
+.RS 12n
+Maximum synchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_sync_write_min_active\fR (int)
+.ad
+.RS 12n
+Minimum synchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB10\fR.
  .RE
  
  .sp
@@ -376,15 +980,103 @@ Seconds to expire .zfs/snapshot
  Default value: \fB300\fR.
  .RE
  
+.sp
+.ne 2
+.na
+\fBzfs_admin_snapshot\fR (int)
+.ad
+.RS 12n
+Allow the creation, removal, or renaming of entries in the .zfs/snapshot
+directory to cause the creation, destruction, or renaming of snapshots.
+When enabled this functionality works both locally and over NFS exports
+which have the 'no_root_squash' option set. This functionality is disabled
+by default.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
  .sp
  .ne 2
  .na
  \fBzfs_flags\fR (int)
  .ad
  .RS 12n
-Set additional debugging flags
+Set additional debugging flags. The following flags may be bitwise-or'd
+together.
+.sp
+.TS
+box;
+rB lB
+lB lB
+r l.
+Value  Symbolic Name
+       Description
+_
+1      ZFS_DEBUG_DPRINTF
+       Enable dprintf entries in the debug log.
+_
+2      ZFS_DEBUG_DBUF_VERIFY *
+       Enable extra dbuf verifications.
+_
+4      ZFS_DEBUG_DNODE_VERIFY *
+       Enable extra dnode verifications.
+_
+8      ZFS_DEBUG_SNAPNAMES
+       Enable snapshot name verification.
+_
+16     ZFS_DEBUG_MODIFY
+       Check for illegally modified ARC buffers.
+_
+32     ZFS_DEBUG_SPA
+       Enable spa_dbgmsg entries in the debug log.
+_
+64     ZFS_DEBUG_ZIO_FREE
+       Enable verification of block frees.
+_
+128    ZFS_DEBUG_HISTOGRAM_VERIFY
+       Enable extra spacemap histogram verifications.
+.TE
+.sp
+* Requires debug build.
  .sp
-Default value: \fB1\fR.
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_free_leak_on_eio\fR (int)
+.ad
+.RS 12n
+If destroy encounters an EIO while reading metadata (e.g. indirect
+blocks), space referenced by the missing metadata can not be freed.
+Normally this causes the background destroy to become "stalled", as
+it is unable to make forward progress.  While in this stalled state,
+all remaining space to free from the error-encountering filesystem is
+"temporarily leaked".  Set this flag to cause it to ignore the EIO,
+permanently leak the space from indirect blocks that can not be read,
+and continue to free everything else that it can.
+
+The default, "stalling" behavior is useful if the storage partially
+fails (i.e. some but not all i/os fail), and then later recovers.  In
+this case, we will be able to continue pool operations while it is
+partially failed, and when it recovers, we can continue to free the
+space, with no leaks.  However, note that this case is actually
+fairly rare.
+
+Typically pools either (a) fail completely (but perhaps temporarily,
+e.g. a top-level vdev going offline), or (b) have localized,
+permanent errors (e.g. disk returns the wrong data due to bit flip or
+firmware bug).  In case (a), this setting does not matter because the
+pool will be suspended and the sync thread will not be able to make
+forward progress regardless.  In case (b), because the error is
+permanent, the best we can do is leak the minimum amount of space,
+which is what setting this flag will do.  Therefore, it is reasonable
+for this flag to normally be set, but we chose the more conservative
+approach of not setting it, so that there is no possibility of
+leaking space in the "partial temporary" failure case.
+.sp
+Default value: \fB0\fR.
  .RE
  
  .sp
@@ -409,6 +1101,24 @@ Largest data block to write to zil
  Default value: \fB32,768\fR.
  .RE
  
+.sp
+.ne 2
+.na
+\fBzfs_max_recordsize\fR (int)
+.ad
+.RS 12n
+We currently support block sizes from 512 bytes to 16MB.  The benefits of
+larger blocks, and thus larger IO, need to be weighed against the cost of
+COWing a giant block to modify one byte.  Additionally, very large blocks
+can have an impact on i/o latency, and also potentially on the memory
+allocator.  Therefore, we do not allow the recordsize to be set larger than
+zfs_max_recordsize (default 1MB).  Larger blocks can be created by changing
+this tunable, and pools with larger blocks can always be imported and used,
+regardless of this setting.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
  .sp
  .ne 2
  .na
@@ -423,21 +1133,65 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
  .sp
  .ne 2
  .na
-\fBzfs_no_scrub_io\fR (int)
+\fBzfs_metaslab_fragmentation_threshold\fR (int)
  .ad
  .RS 12n
-Set for no scrub I/O
+Allow metaslabs to keep their active state as long as their fragmentation
+percentage is less than or equal to this value. An active metaslab that
+exceeds this threshold will no longer keep its active status allowing
+better metaslabs to be selected.
  .sp
-Use \fB1\fR for yes and \fB0\fR for no (default).
+Default value: \fB70\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzfs_no_scrub_prefetch\fR (int)
+\fBzfs_mg_fragmentation_threshold\fR (int)
  .ad
  .RS 12n
-Set for no scrub prefetching
+Metaslab groups are considered eligible for allocations if their
+fragmentation metric (measured as a percentage) is less than or equal to
+this value. If a metaslab group exceeds this threshold then it will be
+skipped unless all metaslab groups within the metaslab class have also
+crossed this threshold.
+.sp
+Default value: \fB85\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_mg_noalloc_threshold\fR (int)
+.ad
+.RS 12n
+Defines a threshold at which metaslab groups should be eligible for
+allocations.  The value is expressed as a percentage of free space
+beyond which a metaslab group is always eligible for allocations.
+If a metaslab group's free space is less than or equal to the
+threshold, the allocator will avoid allocating to that group
+unless all groups in the pool have reached the threshold.  Once all
+groups have reached the threshold, all groups are allowed to accept
+allocations.  The default value of 0 disables the feature and causes
+all metaslab groups to be eligible for allocations.
+
+This parameter helps deal with pools having heavily imbalanced
+vdevs such as would be the case when a new vdev has been added.
+Setting the threshold to a non-zero percentage will stop allocations
+from being made to vdevs that aren't filled to the specified percentage
+and allow lesser filled vdevs to acquire more allocations than they
+otherwise would under the old \fBzfs_mg_alloc_failures\fR facility.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_no_scrub_io\fR (int)
+.ad
+.RS 12n
+Set for no scrub I/O
  .sp
  Use \fB1\fR for yes and \fB0\fR for no (default).
  .RE
@@ -445,10 +1199,10 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
  .sp
  .ne 2
  .na
-\fBzfs_no_write_throttle\fR (int)
+\fBzfs_no_scrub_prefetch\fR (int)
  .ad
  .RS 12n
-Disable write throttling
+Set for no scrub prefetching
  .sp
  Use \fB1\fR for yes and \fB0\fR for no (default).
  .RE
@@ -478,12 +1232,12 @@ Use \fB1\fR for yes (default) and \fB0\fR to disable.
  .sp
  .ne 2
  .na
-\fBzfs_pd_blks_max\fR (int)
+\fBzfs_pd_bytes_max\fR (int)
  .ad
  .RS 12n
-Max number of blocks to prefetch
+The number of bytes which should be prefetched.
  .sp
-Default value: \fB100\fR.
+Default value: \fB52,428,800\fR.
  .RE
  
  .sp
@@ -548,7 +1302,9 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
  \fBzfs_resilver_delay\fR (int)
  .ad
  .RS 12n
-Number of ticks to delay resilver
+Number of ticks to delay prior to issuing a resilver I/O operation when
+a non-resilver or non-scrub I/O operation has occurred within the past
+\fBzfs_scan_idle\fR ticks.
  .sp
  Default value: \fB2\fR.
  .RE
@@ -570,7 +1326,10 @@ Default value: \fB3,000\fR.
  \fBzfs_scan_idle\fR (int)
  .ad
  .RS 12n
-Idle window in clock ticks
+Idle window in clock ticks.  During a scrub or a resilver, if
+a non-scrub or non-resilver I/O operation has occurred during this
+window, the next scrub or resilver operation is delayed by, respectively
+\fBzfs_scrub_delay\fR or \fBzfs_resilver_delay\fR ticks.
  .sp
  Default value: \fB50\fR.
  .RE
@@ -592,11 +1351,24 @@ Default value: \fB1,000\fR.
  \fBzfs_scrub_delay\fR (int)
  .ad
  .RS 12n
-Number of ticks to delay scrub
+Number of ticks to delay prior to issuing a scrub I/O operation when
+a non-scrub or non-resilver I/O operation has occurred within the past
+\fBzfs_scan_idle\fR ticks.
  .sp
  Default value: \fB4\fR.
  .RE
  
+.sp
+.ne 2
+.na
+\fBzfs_send_corrupt_data\fR (int)
+.ad
+.RS 12n
+Allow sending of corrupt data (ignore read/checksum errors when sending data).
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
  .sp
  .ne 2
  .na
@@ -636,7 +1408,7 @@ Default value: \fB2\fR.
  \fBzfs_top_maxinflight\fR (int)
  .ad
  .RS 12n
-Max I/Os per top-level
+Max I/Os per top-level vdev during scrub or resilver operations.
  .sp
  Default value: \fB32\fR.
  .RE
@@ -652,17 +1424,6 @@ Historic statistics for the last N txgs
  Default value: \fB0\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_txg_synctime_ms\fR (int)
-.ad
-.RS 12n
-Target milliseconds between txg sync
-.sp
-Default value: \fB1,000\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -716,28 +1477,6 @@ Total size of the per-disk cache
  Default value: \fB0\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_vdev_max_pending\fR (int)
-.ad
-.RS 12n
-Max pending per-vdev I/Os
-.sp
-Default value: \fB10\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_vdev_min_pending\fR (int)
-.ad
-.RS 12n
-Min pending per-vdev I/Os
-.sp
-Default value: \fB4\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -749,17 +1488,6 @@ Switch mirrors every N usecs
  Default value: \fB10,000\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_vdev_ramp_rate\fR (int)
-.ad
-.RS 12n
-Exponential I/O issue ramp-up rate
-.sp
-Default value: \fB2\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -782,17 +1510,6 @@ I/O scheduler
  Default value: \fBnoop\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_vdev_time_shift\fR (int)
-.ad
-.RS 12n
-Deadline time shift for vdev I/O
-.sp
-Default value: \fB29\fR (each bucket is 0.537 seconds).
-.RE
-
  .sp
  .ne 2
  .na
@@ -804,61 +1521,6 @@ Aggregate write I/O over gap
  Default value: \fB4,096\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_inflated\fR (ulong)
-.ad
-.RS 12n
-Inflated txg write limit
-.sp
-Default value: \fB0\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_max\fR (ulong)
-.ad
-.RS 12n
-Max txg write limit
-.sp
-Default value: \fB0\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_min\fR (ulong)
-.ad
-.RS 12n
-Min txg write limit
-.sp
-Default value: \fB33,554,432\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_override\fR (ulong)
-.ad
-.RS 12n
-Override txg write limit
-.sp
-Default value: \fB0\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_shift\fR (int)
-.ad
-.RS 12n
-log2(fraction of memory) per txg
-.sp
-Default value: \fB3\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -914,24 +1576,13 @@ Max commit bytes to separate log device
  Default value: \fB1,048,576\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzio_bulk_flags\fR (int)
-.ad
-.RS 12n
-Additional flags to pass to bulk buffers
-.sp
-Default value: \fB0\fR.
-.RE
-
  .sp
  .ne 2
  .na
  \fBzio_delay_max\fR (int)
  .ad
  .RS 12n
-Max zio millisec delay before posting event
+Max zio millisecond delay before posting event
  .sp
  Default value: \fB30,000\fR.
  .RE
@@ -939,23 +1590,29 @@ Default value: \fB30,000\fR.
  .sp
  .ne 2
  .na
-\fBzio_injection_enabled\fR (int)
+\fBzio_requeue_io_start_cut_in_line\fR (int)
  .ad
  .RS 12n
-Enable fault injection
+Prioritize requeued I/O
  .sp
-Use \fB1\fR for yes and \fB0\fR for no (default).
+Default value: \fB0\fR.
  .RE
  
  .sp
  .ne 2
  .na
-\fBzio_requeue_io_start_cut_in_line\fR (int)
+\fBzio_taskq_batch_pct\fR (uint)
  .ad
  .RS 12n
-Prioritize requeued I/O
+Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
+for IO. These workers are responsible for IO work such as compression and
+checksum calculations. Fractional number of CPUs will be rounded down.
  .sp
-Default value: \fB0\fR.
+The default value of 75 was chosen to avoid using all CPUs which can result in
+latency issues and inconsistent application performance, especially when high
+compression is enabled.
+.sp
+Default value: \fB75\fR.
  .RE
  
  .sp
@@ -994,11 +1651,198 @@ Default value: \fB16,384\fR.
  .sp
  .ne 2
  .na
-\fBzvol_threads\fR (uint)
+\fBzvol_prefetch_bytes\fR (uint)
  .ad
  .RS 12n
-Number of threads for zvol device
+When adding a zvol to the system, prefetch \fBzvol_prefetch_bytes\fR
+from the start and end of the volume.  Prefetching these regions
+of the volume is desirable because they are likely to be accessed
+immediately by \fBblkid\fR(8) or by the kernel scanning for a partition
+table.
  .sp
-Default value: \fB32\fR.
+Default value: \fB131,072\fR.
  .RE
  
+.SH ZFS I/O SCHEDULER
+ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
+The I/O scheduler determines when and in what order those operations are
+issued.  The I/O scheduler divides operations into five I/O classes
+prioritized in the following order: sync read, sync write, async read,
+async write, and scrub/resilver.  Each queue defines the minimum and
+maximum number of concurrent operations that may be issued to the
+device.  In addition, the device has an aggregate maximum,
+\fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums
+must not exceed the aggregate maximum.  If the sum of the per-queue
+maximums exceeds the aggregate maximum, then the number of active I/Os
+may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will
+be issued regardless of whether all per-queue minimums have been met.
+.sp
+For many physical devices, throughput increases with the number of
+concurrent operations, but latency typically suffers. Further, physical
+devices typically have a limit at which more concurrent operations have no
+effect on throughput or can actually cause it to decrease.
+.sp
+The scheduler selects the next operation to issue by first looking for an
+I/O class whose minimum has not been satisfied. Once all are satisfied and
+the aggregate maximum has not been hit, the scheduler looks for classes
+whose maximum has not been satisfied. Iteration through the I/O classes is
+done in the order specified above. No further operations are issued if the
+aggregate maximum number of concurrent operations has been hit or if there
+are no operations queued for an I/O class that has not hit its maximum.
+Every time an I/O is queued or an operation completes, the I/O scheduler
+looks for new operations to issue.
+.sp
+In general, smaller max_active values will lead to lower latency of synchronous
+operations.  Larger max_active values may lead to higher overall throughput,
+depending on underlying storage.
+.sp
+The ratio of the queues' max_active values determines the balance of performance
+between reads, writes, and scrubs.  E.g., increasing
+\fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete
+more quickly, but will cause reads and writes to have higher latency and lower
+throughput.
+.sp
+All I/O classes have a fixed maximum number of outstanding operations
+except for the async write class. Asynchronous writes represent the data
+that is committed to stable storage during the syncing stage for
+transaction groups. Transaction groups enter the syncing state
+periodically so the number of queued async writes will quickly burst up
+and then bleed down to zero. Rather than servicing them as quickly as
+possible, the I/O scheduler changes the maximum number of active async
+write I/Os according to the amount of dirty data in the pool.  Since
+both throughput and latency typically increase with the number of
+concurrent operations issued to physical devices, reducing the
+burstiness in the number of concurrent operations also stabilizes the
+response time of operations from other -- and in particular synchronous
+-- queues. In broad strokes, the I/O scheduler will issue more
+concurrent operations from the async write queue as there's more dirty
+data in the pool.
+.sp
+Async Writes
+.sp
+The number of concurrent operations issued for the async write I/O class
+follows a piece-wise linear function defined by a few adjustable points.
+.nf
+
+       |              o---------| <-- zfs_vdev_async_write_max_active
+  ^    |             /^         |
+  |    |            / |         |
+active |           /  |         |
+ I/O   |          /   |         |
+count  |         /    |         |
+       |        /     |         |
+       |-------o      |         | <-- zfs_vdev_async_write_min_active
+      0|_______^______|_________|
+       0%      |      |       100% of zfs_dirty_data_max
+               |      |
+               |      `-- zfs_vdev_async_write_active_max_dirty_percent
+               `--------- zfs_vdev_async_write_active_min_dirty_percent
+
+.fi
+Until the amount of dirty data exceeds a minimum percentage of the dirty
+data allowed in the pool, the I/O scheduler will limit the number of
+concurrent operations to the minimum. As that threshold is crossed, the
+number of concurrent operations issued increases linearly to the maximum at
+the specified maximum percentage of the dirty data allowed in the pool.
+.sp
+Ideally, the amount of dirty data on a busy pool will stay in the sloped
+part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR
+and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the
+maximum percentage, this indicates that the rate of incoming data is
+greater than the rate that the backend storage can handle. In this case, we
+must further throttle incoming writes, as described in the next section.
+
+.SH ZFS TRANSACTION DELAY
+We delay transactions when we've determined that the backend storage
+isn't able to accommodate the rate of incoming writes.
+.sp
+If there is already a transaction waiting, we delay relative to when
+that transaction will finish waiting.  This way the calculated delay time
+is independent of the number of threads concurrently executing
+transactions.
+.sp
+If we are the only waiter, wait relative to when the transaction
+started, rather than the current time.  This credits the transaction for
+"time already served", e.g. reading indirect blocks.
+.sp
+The minimum time for a transaction to take is calculated as:
+.nf
+    min_time = zfs_delay_scale * (dirty - min) / (max - dirty)
+    min_time is then capped at 100 milliseconds.
+.fi
+.sp
+The delay has two degrees of freedom that can be adjusted via tunables.  The
+percentage of dirty data at which we start to delay is defined by
+\fBzfs_delay_min_dirty_percent\fR. This should typically be at or above
+\fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to
+delay after writing at full speed has failed to keep up with the incoming write
+rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking,
+this variable determines the amount of delay at the midpoint of the curve.
+.sp
+.nf
+delay
+ 10ms +-------------------------------------------------------------*+
+      |                                                             *|
+  9ms +                                                             *+
+      |                                                             *|
+  8ms +                                                             *+
+      |                                                            * |
+  7ms +                                                            * +
+      |                                                            * |
+  6ms +                                                            * +
+      |                                                            * |
+  5ms +                                                           *  +
+      |                                                           *  |
+  4ms +                                                           *  +
+      |                                                           *  |
+  3ms +                                                          *   +
+      |                                                          *   |
+  2ms +                                              (midpoint) *    +
+      |                                                  |    **     |
+  1ms +                                                  v ***       +
+      |             zfs_delay_scale ---------->     ********         |
+    0 +-------------------------------------*********----------------+
+      0%                    <- zfs_dirty_data_max ->               100%
+.fi
+.sp
+Note that since the delay is added to the outstanding time remaining on the
+most recent transaction, the delay is effectively the inverse of IOPS.
+Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+was chosen such that small changes in the amount of accumulated dirty data
+in the first 3/4 of the curve yield relatively small differences in the
+amount of delay.
+.sp
+The effects can be easier to understand when the amount of delay is
+represented on a log scale:
+.sp
+.nf
+delay
+100ms +-------------------------------------------------------------++
+      +                                                              +
+      |                                                              |
+      +                                                             *+
+ 10ms +                                                             *+
+      +                                                           ** +
+      |                                              (midpoint)  **  |
+      +                                                  |     **    +
+  1ms +                                                  v ****      +
+      +             zfs_delay_scale ---------->        *****         +
+      |                                             ****             |
+      +                                          ****                +
+100us +                                        **                    +
+      +                                       *                      +
+      |                                      *                       |
+      +                                     *                        +
+ 10us +                                     *                        +
+      +                                                              +
+      |                                                              |
+      +                                                              +
+      +--------------------------------------------------------------+
+      0%                    <- zfs_dirty_data_max ->               100%
+.fi
+.sp
+Note here that only as the amount of dirty data approaches its limit does
+the delay start to increase rapidly. The goal of a properly tuned system
+should be to keep the amount of dirty data out of that range by first
+ensuring that the appropriate limits are set for the I/O scheduler to reach
+optimal throughput on the backend storage, and then by changing the value
+of \fBzfs_delay_scale\fR to increase the steepness of the curve.