Restrict kstats and print real pointers

[mirror_zfs.git] / module / zfs / zio.c
diff --git a/module/zfs/zio.c b/module/zfs/zio.c

index e8c2ca89aff9cd3e855e4870e06c603bb00dc523..016ac07eabd92eed09bff77ba6f9bed37612cf7d 100644 (file)
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -32,6 +32,7 @@
  #include <sys/txg.h>
  #include <sys/spa_impl.h>
  #include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
  #include <sys/zio_impl.h>
  #include <sys/zio_compress.h>
  #include <sys/zio_checksum.h>
@@ -58,10 +59,11 @@ const char *zio_type_name[ZIO_TYPES] = {
          * Note: Linux kernel thread name length is limited
          * so these names will differ from upstream open zfs.
          */
-       "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
+       "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
  };
  
  int zio_dva_throttle_enabled = B_TRUE;
+int zio_deadman_log_all = B_FALSE; /* if set, deadman logs zios even without a leaf vdev */
  
  /*
   * ==========================================================================
@@ -77,7 +79,8 @@ uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  #endif
  
-int zio_delay_max = ZIO_DELAY_MAX;
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
  
  #define        BP_SPANB(indblkshift, level) \
         (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
@@ -759,7 +762,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
  {
         zio_t *zio;
  
-       ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
+       IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
         ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
  
@@ -1209,6 +1212,26 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
         return (zio);
  }
  
+zio_t *
+zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, enum trim_flag trim_flags)
+{ /* Create and return a ZIO_TYPE_TRIM zio for vd at the given offset/size. */
+       zio_t *zio;
+
+       ASSERT0(vd->vdev_children); /* vd must have no children */
+       ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); /* offset aligned to 1 << ashift */
+       ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); /* size aligned to 1 << ashift */
+       ASSERT3U(size, !=, 0); /* size must be nonzero */
+
+       zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, /* txg 0, bp NULL; size passed twice, presumably lsize and psize (confirm) */
+           private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, /* ZIO_FLAG_PHYSICAL is always added to the caller's flags */
+           vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
+       zio->io_trim_flags = trim_flags; /* stash the caller's trim flags on the new zio */
+
+       return (zio);
+}
+
  zio_t *
  zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
      abd_t *data, int checksum, zio_done_func_t *done, void *private,
@@ -1737,7 +1760,8 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
          * If this is a high priority I/O, then use the high priority taskq if
          * available.
          */
-       if (zio->io_priority == ZIO_PRIORITY_NOW &&
+       if ((zio->io_priority == ZIO_PRIORITY_NOW ||
+           zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
                 q++;
  
@@ -1831,6 +1855,7 @@ zio_delay_interrupt(zio_t *zio)
                         if (NSEC_TO_TICK(diff) == 0) {
                                 /* Our delay is less than a jiffy - just spin */
                                 zfs_sleep_until(zio->io_target_timestamp);
+                               zio_interrupt(zio);
                         } else {
                                 /*
                                  * Use taskq_dispatch_delay() in the place of
@@ -1856,30 +1881,30 @@ zio_delay_interrupt(zio_t *zio)
  }
  
  static void
-zio_deadman_impl(zio_t *pio)
+zio_deadman_impl(zio_t *pio, int ziodepth)
  {
         zio_t *cio, *cio_next;
         zio_link_t *zl = NULL;
         vdev_t *vd = pio->io_vd;
  
-       if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
-               vdev_queue_t *vq = &vd->vdev_queue;
+       if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
+               vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
                 zbookmark_phys_t *zb = &pio->io_bookmark;
                 uint64_t delta = gethrtime() - pio->io_timestamp;
                 uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
  
-               zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
+               zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
                     "delta=%llu queued=%llu io=%llu "
                     "path=%s last=%llu "
                     "type=%d priority=%d flags=0x%x "
                     "stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
                     "objset=%llu object=%llu level=%llu blkid=%llu "
                     "offset=%llu size=%llu error=%d",
-                   pio, pio->io_timestamp,
+                   ziodepth, pio, pio->io_timestamp,
                     delta, pio->io_delta, pio->io_delay,
-                   vd->vdev_path, vq->vq_io_complete_ts,
+                   vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0,
                     pio->io_type, pio->io_priority, pio->io_flags,
-                   pio->io_state, pio->io_pipeline, pio->io_pipeline_trace,
+                   pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
                     zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
                     pio->io_offset, pio->io_size, pio->io_error);
                 zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
@@ -1894,7 +1919,7 @@ zio_deadman_impl(zio_t *pio)
         mutex_enter(&pio->io_lock);
         for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
                 cio_next = zio_walk_children(pio, &zl);
-               zio_deadman_impl(cio);
+               zio_deadman_impl(cio, ziodepth + 1);
         }
         mutex_exit(&pio->io_lock);
  }
@@ -1912,7 +1937,7 @@ zio_deadman(zio_t *pio, char *tag)
         if (!zfs_deadman_enabled || spa_suspended(spa))
                 return;
  
-       zio_deadman_impl(pio);
+       zio_deadman_impl(pio, 0);
  
         switch (spa_get_deadman_failmode(spa)) {
         case ZIO_FAILURE_MODE_WAIT:
@@ -3419,7 +3444,7 @@ zio_dva_allocate(zio_t *zio)
         }
  
         if (error != 0) {
-               zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
+               zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
                     error);
                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
@@ -3558,7 +3583,6 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
   * ==========================================================================
   */
  
-
  /*
   * Issue an I/O to the underlying vdev. Typically the issue pipeline
   * stops after this stage and will resume upon I/O completion.
@@ -3681,8 +3705,8 @@ zio_vdev_io_start(zio_t *zio)
                 return (zio);
         }
  
-       if (vd->vdev_ops->vdev_op_leaf &&
-           (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+       if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
+           zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
  
                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
                         return (zio);
@@ -3713,7 +3737,8 @@ zio_vdev_io_done(zio_t *zio)
                 return (NULL);
         }
  
-       ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+       ASSERT(zio->io_type == ZIO_TYPE_READ ||
+           zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
  
         if (zio->io_delay)
                 zio->io_delay = gethrtime() - zio->io_delay;
@@ -3732,7 +3757,7 @@ zio_vdev_io_done(zio_t *zio)
                 if (zio_injection_enabled && zio->io_error == 0)
                         zio->io_error = zio_handle_label_injection(zio, EIO);
  
-               if (zio->io_error) {
+               if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
                         if (!vdev_accessible(vd, zio)) {
                                 zio->io_error = SET_ERROR(ENXIO);
                         } else {
@@ -3862,8 +3887,8 @@ zio_vdev_io_assess(zio_t *zio)
  
         /*
          * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
-        * attempts will ever succeed. In this case we set a persistent bit so
-        * that we don't bother with it in the future.
+        * attempts will ever succeed. In this case we set a persistent
+        * boolean flag so that we don't bother with it in the future.
          */
         if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
             zio->io_type == ZIO_TYPE_IOCTL &&
@@ -4014,7 +4039,7 @@ zio_encrypt(zio_t *zio)
         /*
          * Later passes of sync-to-convergence may decide to rewrite data
          * in place to avoid more disk reallocations. This presents a problem
-        * for encryption because this consitutes rewriting the new data with
+        * for encryption because this constitutes rewriting the new data with
          * the same encryption key and IV. However, this only applies to blocks
          * in the MOS (particularly the spacemaps) and we do not encrypt the
          * MOS. We assert that the zio is allocating or an intent log write
@@ -4128,6 +4153,10 @@ zio_checksum_verify(zio_t *zio)
                 zio->io_error = error;
                 if (error == ECKSUM &&
                     !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+                       mutex_enter(&zio->io_vd->vdev_stat_lock);
+                       zio->io_vd->vdev_stat.vs_checksum_errors++;
+                       mutex_exit(&zio->io_vd->vdev_stat_lock);
+
                         zfs_ereport_start_checksum(zio->io_spa,
                             zio->io_vd, &zio->io_bookmark, zio,
                             zio->io_offset, zio->io_size, NULL, &info);
@@ -4150,7 +4179,7 @@ zio_checksum_verified(zio_t *zio)
   * ==========================================================================
   * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
   * An error of 0 indicates success.  ENXIO indicates whole-device failure,
- * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
+ * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
   * indicate errors that are specific to one I/O, and most likely permanent.
   * Any other error is presumed to be worse because we weren't expecting it.
   * ==========================================================================
@@ -4321,7 +4350,7 @@ zio_done(zio_t *zio)
  {
         /*
          * Always attempt to keep stack usage minimal here since
-        * we can be called recurisvely up to 19 levels deep.
+        * we can be called recursively up to 19 levels deep.
          */
         const uint64_t psize = zio->io_size;
         zio_t *pio, *pio_next;
@@ -4431,10 +4460,28 @@ zio_done(zio_t *zio)
          * 30 seconds to complete, post an error described the I/O delay.
          * We ignore these errors if the device is currently unavailable.
          */
-       if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
-               if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
-                       zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
-                           zio->io_vd, &zio->io_bookmark, zio, 0, 0);
+       if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
+               if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
+                       /*
+                        * We want to only increment our slow IO counters if
+                        * the IO is valid (i.e. not if the drive is removed).
+                        *
+                        * zfs_ereport_post() will also do these checks, but
+                        * it can also ratelimit and have other failures, so we
+                        * need to increment the slow_io counters independent
+                        * of it.
+                        */
+                       if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
+                           zio->io_spa, zio->io_vd, zio)) {
+                               mutex_enter(&zio->io_vd->vdev_stat_lock);
+                               zio->io_vd->vdev_stat.vs_slow_ios++;
+                               mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+                               zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+                                   zio->io_spa, zio->io_vd, &zio->io_bookmark,
+                                   zio, 0, 0);
+                       }
+               }
         }
  
         if (zio->io_error) {
@@ -4445,9 +4492,18 @@ zio_done(zio_t *zio)
                  * device is currently unavailable.
                  */
                 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
-                   !vdev_is_dead(zio->io_vd))
+                   !vdev_is_dead(zio->io_vd)) {
+                       mutex_enter(&zio->io_vd->vdev_stat_lock);
+                       if (zio->io_type == ZIO_TYPE_READ) {
+                               zio->io_vd->vdev_stat.vs_read_errors++;
+                       } else if (zio->io_type == ZIO_TYPE_WRITE) {
+                               zio->io_vd->vdev_stat.vs_write_errors++;
+                       }
+                       mutex_exit(&zio->io_vd->vdev_stat_lock);
+
                         zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
                             zio->io_vd, &zio->io_bookmark, zio, 0, 0);
+               }
  
                 if ((zio->io_error == EIO || !(zio->io_flags &
                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
@@ -4823,8 +4879,9 @@ EXPORT_SYMBOL(zio_data_buf_alloc);
  EXPORT_SYMBOL(zio_buf_free);
  EXPORT_SYMBOL(zio_data_buf_free);
  
-module_param(zio_delay_max, int, 0644);
-MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+module_param(zio_slow_io_ms, int, 0644);
+MODULE_PARM_DESC(zio_slow_io_ms,
+       "Max I/O completion time (milliseconds) before marking it as slow");
  
  module_param(zio_requeue_io_start_cut_in_line, int, 0644);
  MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
@@ -4844,4 +4901,8 @@ MODULE_PARM_DESC(zfs_sync_pass_rewrite,
  module_param(zio_dva_throttle_enabled, int, 0644);
  MODULE_PARM_DESC(zio_dva_throttle_enabled,
         "Throttle block allocations in the ZIO pipeline");
+
+module_param(zio_deadman_log_all, int, 0644);
+MODULE_PARM_DESC(zio_deadman_log_all,
+       "Log all slow ZIOs, not just those with vdevs");
  #endif