#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
* Note: Linux kernel thread name length is limited
* so these names will differ from upstream open zfs.
*/
- "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
+ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
};
int zio_dva_throttle_enabled = B_TRUE;
+int zio_deadman_log_all = B_FALSE;
/*
* ==========================================================================
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#endif
-int zio_delay_max = ZIO_DELAY_MAX;
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
#define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
{
zio_t *zio;
- ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
+ IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
return (zio);
}
+zio_t *
+zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ zio_done_func_t *done, void *private, zio_priority_t priority,
+ enum zio_flag flags, enum trim_flag trim_flags)
+{
+ zio_t *zio;
+
+ ASSERT0(vd->vdev_children);
+ ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ ASSERT3U(size, !=, 0);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
+ private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
+ vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
+ zio->io_trim_flags = trim_flags;
+
+ return (zio);
+}
+
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
abd_t *data, int checksum, zio_done_func_t *done, void *private,
* If this is a high priority I/O, then use the high priority taskq if
* available.
*/
- if (zio->io_priority == ZIO_PRIORITY_NOW &&
+ if ((zio->io_priority == ZIO_PRIORITY_NOW ||
+ zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
q++;
if (NSEC_TO_TICK(diff) == 0) {
/* Our delay is less than a jiffy - just spin */
zfs_sleep_until(zio->io_target_timestamp);
+ zio_interrupt(zio);
} else {
/*
* Use taskq_dispatch_delay() in the place of
}
static void
-zio_deadman_impl(zio_t *pio)
+zio_deadman_impl(zio_t *pio, int ziodepth)
{
zio_t *cio, *cio_next;
zio_link_t *zl = NULL;
vdev_t *vd = pio->io_vd;
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
- vdev_queue_t *vq = &vd->vdev_queue;
+ if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
+ vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
zbookmark_phys_t *zb = &pio->io_bookmark;
uint64_t delta = gethrtime() - pio->io_timestamp;
uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
- zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
+ zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
"delta=%llu queued=%llu io=%llu "
"path=%s last=%llu "
"type=%d priority=%d flags=0x%x "
"stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
"objset=%llu object=%llu level=%llu blkid=%llu "
"offset=%llu size=%llu error=%d",
- pio, pio->io_timestamp,
+ ziodepth, pio, pio->io_timestamp,
delta, pio->io_delta, pio->io_delay,
- vd->vdev_path, vq->vq_io_complete_ts,
+ vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0,
pio->io_type, pio->io_priority, pio->io_flags,
- pio->io_state, pio->io_pipeline, pio->io_pipeline_trace,
+ pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
pio->io_offset, pio->io_size, pio->io_error);
zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
- zio_deadman_impl(cio);
+ zio_deadman_impl(cio, ziodepth + 1);
}
mutex_exit(&pio->io_lock);
}
if (!zfs_deadman_enabled || spa_suspended(spa))
return;
- zio_deadman_impl(pio);
+ zio_deadman_impl(pio, 0);
switch (spa_get_deadman_failmode(spa)) {
case ZIO_FAILURE_MODE_WAIT:
}
if (error != 0) {
- zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
+ zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
* ==========================================================================
*/
-
/*
* Issue an I/O to the underlying vdev. Typically the issue pipeline
* stops after this stage and will resume upon I/O completion.
return (zio);
}
- if (vd->vdev_ops->vdev_op_leaf &&
- (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+ if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
return (zio);
return (NULL);
}
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
if (zio->io_delay)
zio->io_delay = gethrtime() - zio->io_delay;
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_label_injection(zio, EIO);
- if (zio->io_error) {
+ if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
} else {
/*
* If a cache flush returns ENOTSUP or ENOTTY, we know that no future
- * attempts will ever succeed. In this case we set a persistent bit so
- * that we don't bother with it in the future.
+ * attempts will ever succeed. In this case we set a persistent
+ * boolean flag so that we don't bother with it in the future.
*/
if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
zio->io_type == ZIO_TYPE_IOCTL &&
/*
* Later passes of sync-to-convergence may decide to rewrite data
* in place to avoid more disk reallocations. This presents a problem
- * for encryption because this consitutes rewriting the new data with
+ * for encryption because this constitutes rewriting the new data with
* the same encryption key and IV. However, this only applies to blocks
* in the MOS (particularly the spacemaps) and we do not encrypt the
* MOS. We assert that the zio is allocating or an intent log write
zio->io_error = error;
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+
zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio,
zio->io_offset, zio->io_size, NULL, &info);
* ==========================================================================
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
* An error of 0 indicates success. ENXIO indicates whole-device failure,
- * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO
+ * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
* ==========================================================================
{
/*
* Always attempt to keep stack usage minimal here since
- * we can be called recurisvely up to 19 levels deep.
+ * we can be called recursively up to 19 levels deep.
*/
const uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
* 30 seconds to complete, post an error described the I/O delay.
* We ignore these errors if the device is currently unavailable.
*/
- if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
- if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
- zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
- zio->io_vd, &zio->io_bookmark, zio, 0, 0);
+ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
+ if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
+ /*
+ * We want to only increment our slow IO counters if
+ * the IO is valid (i.e. not if the drive is removed).
+ *
+ * zfs_ereport_post() will also do these checks, but
+ * it can also ratelimit and have other failures, so we
+ * need to increment the slow_io counters independent
+ * of it.
+ */
+ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, zio)) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_slow_ios++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+ zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0, 0);
+ }
+ }
}
if (zio->io_error) {
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
- !vdev_is_dead(zio->io_vd))
+ !vdev_is_dead(zio->io_vd)) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ zio->io_vd->vdev_stat.vs_read_errors++;
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ zio->io_vd->vdev_stat.vs_write_errors++;
+ }
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+
zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
+ }
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free);
-module_param(zio_delay_max, int, 0644);
-MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+module_param(zio_slow_io_ms, int, 0644);
+MODULE_PARM_DESC(zio_slow_io_ms,
+ "Max I/O completion time (milliseconds) before marking it as slow");
module_param(zio_requeue_io_start_cut_in_line, int, 0644);
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
module_param(zio_dva_throttle_enabled, int, 0644);
MODULE_PARM_DESC(zio_dva_throttle_enabled,
"Throttle block allocations in the ZIO pipeline");
+
+module_param(zio_deadman_log_all, int, 0644);
+MODULE_PARM_DESC(zio_deadman_log_all,
+ "Log all slow ZIOs, not just those with vdevs");
#endif