Distributed Spare (dRAID) Feature
module/zfs/vdev.c
index e41e79ab8a188f61df56594d677852f38e3733ec..38f36e52fca66c80f092ea00251a47a70f887d7c 100644
@@ -40,6 +40,7 @@
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -51,6 +52,7 @@
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 static vdev_ops_t *vdev_ops_table[] = {
        &vdev_root_ops,
        &vdev_raidz_ops,
+       &vdev_draid_ops,
+       &vdev_draid_spare_ops,
        &vdev_mirror_ops,
        &vdev_replacing_ops,
        &vdev_spare_ops,
@@ -221,10 +225,11 @@ vdev_getops(const char *type)
 
 /* ARGSUSED */
 void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
-       res->rs_start = in->rs_start;
-       res->rs_end = in->rs_end;
+       physical_rs->rs_start = logical_rs->rs_start;
+       physical_rs->rs_end = logical_rs->rs_end;
 }
 
 /*
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
        return (asize);
 }
 
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+       return (vd->vdev_min_asize);
+}
+
 /*
  * Get the minimum allocatable size. We define the allocatable size as
  * the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
        if (vd == vd->vdev_top)
                return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
 
-       /*
-        * The allocatable space for a raidz vdev is N * sizeof(smallest child),
-        * so each child must provide at least 1/Nth of its asize.
-        */
-       if (pvd->vdev_ops == &vdev_raidz_ops)
-               return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
-                   pvd->vdev_children);
-
-       return (pvd->vdev_min_asize);
+       return (pvd->vdev_ops->vdev_op_min_asize(pvd));
 }
 
 void
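The RAID-Z special case deleted above does not vanish; with this change the same per-child arithmetic is expected to move behind the new vdev_op_min_asize callback in the raidz/dRAID code. A minimal sketch of such a callback, reusing the ceiling division from the removed lines (the function name is illustrative, not necessarily what vdev_raidz.c defines):

static uint64_t
vdev_raidz_min_asize_sketch(vdev_t *vd)
{
        /*
         * Allocatable space is N * sizeof(smallest child), so each
         * child must provide at least 1/Nth of the parent's minimum
         * asize, rounded up.
         */
        return ((vd->vdev_min_asize + vd->vdev_children - 1) /
            vd->vdev_children);
}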
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
                vdev_set_min_asize(vd->vdev_child[c]);
 }
 
+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+       uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+       if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+               min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+       return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+       uint64_t nparity = 0;
+
+       if (vd->vdev_ops->vdev_op_nparity != NULL)
+               nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+       return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+       uint64_t ndisks = 1;
+
+       if (vd->vdev_ops->vdev_op_ndisks != NULL)
+               ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+       return (ndisks);
+}
+
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
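These three accessors are NULL-tolerant: a top-level vdev whose ops table does not provide vdev_op_min_alloc, vdev_op_nparity, or vdev_op_ndisks still reports sane defaults (one ashift-sized allocation, zero parity, one data disk). A hypothetical caller illustrating that, using only interfaces visible in this diff plus zfs_dbgmsg():

static void
vdev_log_layout_sketch(vdev_t *tvd)
{
        zfs_dbgmsg("vdev %llu: min_alloc=%llu nparity=%llu ndisks=%llu",
            (u_longlong_t)tvd->vdev_id,
            (u_longlong_t)vdev_get_min_alloc(tvd),
            (u_longlong_t)vdev_get_nparity(tvd),
            (u_longlong_t)vdev_get_ndisks(tvd));
}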
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        list_link_init(&vd->vdev_initialize_node);
        list_link_init(&vd->vdev_leaf_node);
        list_link_init(&vd->vdev_trim_node);
+
        mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
        mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
-       cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
 
        for (int t = 0; t < DTL_TYPES; t++) {
                vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 {
        vdev_ops_t *ops;
        char *type;
-       uint64_t guid = 0, islog, nparity;
+       uint64_t guid = 0, islog;
        vdev_t *vd;
        vdev_indirect_config_t *vic;
        char *tmp = NULL;
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
        if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
                return (SET_ERROR(ENOTSUP));
 
-       /*
-        * Set the nparity property for RAID-Z vdevs.
-        */
-       nparity = -1ULL;
-       if (ops == &vdev_raidz_ops) {
-               if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
-                   &nparity) == 0) {
-                       if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
-                               return (SET_ERROR(EINVAL));
-                       /*
-                        * Previous versions could only support 1 or 2 parity
-                        * device.
-                        */
-                       if (nparity > 1 &&
-                           spa_version(spa) < SPA_VERSION_RAIDZ2)
-                               return (SET_ERROR(ENOTSUP));
-                       if (nparity > 2 &&
-                           spa_version(spa) < SPA_VERSION_RAIDZ3)
-                               return (SET_ERROR(ENOTSUP));
-               } else {
-                       /*
-                        * We require the parity to be specified for SPAs that
-                        * support multiple parity levels.
-                        */
-                       if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
-                               return (SET_ERROR(EINVAL));
-                       /*
-                        * Otherwise, we default to 1 parity device for RAID-Z.
-                        */
-                       nparity = 1;
-               }
-       } else {
-               nparity = 0;
-       }
-       ASSERT(nparity != -1ULL);
-
-       /*
-        * If creating a top-level vdev, check for allocation classes input
-        */
        if (top_level && alloctype == VDEV_ALLOC_ADD) {
                char *bias;
 
+               /*
+                * If creating a top-level vdev, check for allocation
+                * classes input.
+                */
                if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
                    &bias) == 0) {
                        alloc_bias = vdev_derive_alloc_bias(bias);
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                                return (SET_ERROR(ENOTSUP));
                        }
                }
+
+               /* spa_vdev_add() expects feature to be enabled */
+               if (ops == &vdev_draid_ops &&
+                   spa->spa_load_state != SPA_LOAD_CREATE &&
+                   !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+                       return (SET_ERROR(ENOTSUP));
+               }
        }
 
-       vd = vdev_alloc_common(spa, id, guid, ops);
-       vic = &vd->vdev_indirect_config;
+       /*
+        * Initialize the vdev specific data.  This is done before calling
+        * vdev_alloc_common() since it may fail and this simplifies the
+        * error reporting and cleanup code paths.
+        */
+       void *tsd = NULL;
+       if (ops->vdev_op_init != NULL) {
+               rc = ops->vdev_op_init(spa, nv, &tsd);
+               if (rc != 0) {
+                       return (rc);
+               }
+       }
 
+       vd = vdev_alloc_common(spa, id, guid, ops);
+       vd->vdev_tsd = tsd;
        vd->vdev_islog = islog;
-       vd->vdev_nparity = nparity;
+
        if (top_level && alloc_bias != VDEV_BIAS_NONE)
                vd->vdev_alloc_bias = alloc_bias;
 
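The new vdev_op_init/vdev_op_fini hooks let a vdev type validate its nvlist configuration and allocate private state before vdev_alloc_common() runs; the result is stored in vd->vdev_tsd and torn down from vdev_free(). A hypothetical pair matching the call sites in this hunk (the structure and nvpair handling are illustrative only; dRAID's real parsing lives in vdev_draid.c):

typedef struct vdev_example_config {
        uint64_t vec_nparity;
} vdev_example_config_t;

static int
vdev_example_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
        vdev_example_config_t *vec;
        uint64_t nparity;

        (void) spa;

        if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) != 0)
                return (SET_ERROR(EINVAL));

        vec = kmem_zalloc(sizeof (*vec), KM_SLEEP);
        vec->vec_nparity = nparity;
        *tsd = vec;

        return (0);
}

static void
vdev_example_fini(vdev_t *vd)
{
        kmem_free(vd->vdev_tsd, sizeof (vdev_example_config_t));
        vd->vdev_tsd = NULL;
}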
@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
            &vd->vdev_wholedisk) != 0)
                vd->vdev_wholedisk = -1ULL;
 
+       vic = &vd->vdev_indirect_config;
+
        ASSERT0(vic->vic_mapping_object);
        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
            &vic->vic_mapping_object);
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
        ASSERT(vd->vdev_child == NULL);
        ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
+       if (vd->vdev_ops->vdev_op_fini != NULL)
+               vd->vdev_ops->vdev_op_fini(vd);
+
        /*
         * Discard allocation state.
         */
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
        cv_destroy(&vd->vdev_trim_io_cv);
 
        mutex_destroy(&vd->vdev_rebuild_lock);
-       mutex_destroy(&vd->vdev_rebuild_io_lock);
        cv_destroy(&vd->vdev_rebuild_cv);
-       cv_destroy(&vd->vdev_rebuild_io_cv);
 
        zfs_ratelimit_fini(&vd->vdev_delay_rl);
        zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
 }
 
 /*
- * Add a mirror/replacing vdev above an existing vdev.
+ * Add a mirror/replacing vdev above an existing vdev.  There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
                                spa->spa_max_ashift = vd->vdev_ashift;
                        if (vd->vdev_ashift < spa->spa_min_ashift)
                                spa->spa_min_ashift = vd->vdev_ashift;
+
+                       uint64_t min_alloc = vdev_get_min_alloc(vd);
+                       if (min_alloc < spa->spa_min_alloc)
+                               spa->spa_min_alloc = min_alloc;
                }
        }
 }
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
        return (B_FALSE);
 }
 
-void
-vdev_open_children(vdev_t *vd)
+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+       return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs.  If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread.  This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
-       taskq_t *tq;
        int children = vd->vdev_children;
 
-       /*
-        * in order to handle pools on top of zvols, do the opens
-        * in a single thread so that the same thread holds the
-        * spa_namespace_lock
-        */
-       if (vdev_uses_zvols(vd)) {
-retry_sync:
-               for (int c = 0; c < children; c++)
-                       vd->vdev_child[c]->vdev_open_error =
-                           vdev_open(vd->vdev_child[c]);
-       } else {
-               tq = taskq_create("vdev_open", children, minclsyspri,
-                   children, children, TASKQ_PREPOPULATE);
-               if (tq == NULL)
-                       goto retry_sync;
+       taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+           children, children, TASKQ_PREPOPULATE);
+       vd->vdev_nonrot = B_TRUE;
 
-               for (int c = 0; c < children; c++)
+       for (int c = 0; c < children; c++) {
+               vdev_t *cvd = vd->vdev_child[c];
+
+               if (open_func(cvd) == B_FALSE)
+                       continue;
+
+               if (tq == NULL || vdev_uses_zvols(vd)) {
+                       cvd->vdev_open_error = vdev_open(cvd);
+               } else {
                        VERIFY(taskq_dispatch(tq, vdev_open_child,
-                           vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+                           cvd, TQ_SLEEP) != TASKQID_INVALID);
+               }
 
+               vd->vdev_nonrot &= cvd->vdev_nonrot;
+       }
+
+       if (tq != NULL) {
+               taskq_wait(tq);
                taskq_destroy(tq);
        }
+}
 
-       vd->vdev_nonrot = B_TRUE;
+/*
+ * Open all child vdevs.
+ */
+void
+vdev_open_children(vdev_t *vd)
+{
+       vdev_open_children_impl(vd, vdev_default_open_children_func);
+}
 
-       for (int c = 0; c < children; c++)
-               vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+       vdev_open_children_impl(vd, open_func);
 }
 
 /*
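vdev_open_children_subset() exists so a vdev implementation can open only some of its children; the predicate has the same shape as vdev_default_open_children_func() above. A plausible filter that skips distributed spare children (whether dRAID uses exactly this filter is an assumption):

static boolean_t
vdev_open_children_skip_dspares(vdev_t *vd)
{
        return (vd->vdev_ops != &vdev_draid_spare_ops);
}

A caller would pass it as vdev_open_children_subset(tvd, vdev_open_children_skip_dspares) from its open path.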
@@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd)
                return (error);
        }
 
+       /*
+        * Track the minimum allocation size.
+        */
+       if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+           vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+               uint64_t min_alloc = vdev_get_min_alloc(vd);
+               if (min_alloc < spa->spa_min_alloc)
+                       spa->spa_min_alloc = min_alloc;
+       }
+
        /*
         * If this is a leaf vdev, assess whether a resilver is needed.
         * But don't do this if we are doing a reopen for a scrub, since
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
        vdev_t *pvd = vd->vdev_parent;
        spa_t *spa __maybe_unused = vd->vdev_spa;
 
-       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+       ASSERT(vd != NULL);
+       ASSERT(vd->vdev_open_thread == curthread ||
+           spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
        /*
         * If our parent is reopening, then we are as well, unless we are
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 }
 
 /*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ * Check if the txg falls within the range which must be
+ * resilvered.  DVAs outside this range can always be skipped.
+ */
+boolean_t
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+       /* Set by sequential resilver. */
+       if (phys_birth == TXG_UNKNOWN)
+               return (B_TRUE);
+
+       return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
  */
 boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
 {
        ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
            vd->vdev_ops->vdev_op_leaf)
                return (B_TRUE);
 
-       return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+       return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+           phys_birth));
 }
 
 /*
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
                        continue;                       /* leaf vdevs only */
                if (t == DTL_PARTIAL)
                        minref = 1;                     /* i.e. non-zero */
-               else if (vd->vdev_nparity != 0)
-                       minref = vd->vdev_nparity + 1;  /* RAID-Z */
+               else if (vdev_get_nparity(vd) != 0)
+                       minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
                else
                        minref = vd->vdev_children;     /* any kind of mirror */
                space_reftree_create(&reftree);
@@ -3727,6 +3820,9 @@ top:
        if (!vd->vdev_ops->vdev_op_leaf)
                return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
+       if (vd->vdev_ops == &vdev_draid_spare_ops)
+               return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
        tvd = vd->vdev_top;
        mg = tvd->vdev_mg;
        generation = spa->spa_config_generation + 1;
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
+       /*
+        * Exclude the dRAID spare when aggregating to avoid double counting
+        * the ops and bytes.  These IOs are counted by the physical leaves.
+        */
+       if (cvd->vdev_ops == &vdev_draid_spare_ops)
+               return;
+
        for (int t = 0; t < VS_ZIO_TYPES; t++) {
                vs->vs_ops[t] += cvs->vs_ops[t];
                vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                                vdev_get_child_stat(cvd, vs, cvs);
                        if (vsx)
                                vdev_get_child_stat_ex(cvd, vsx, cvsx);
-
                }
        } else {
                /*
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 
                        /*
                         * Repair is the result of a rebuild issued by the
-                        * rebuild thread (vdev_rebuild_thread).
+                        * rebuild thread (vdev_rebuild_thread).  To avoid
+                        * double counting repaired bytes the virtual dRAID
+                        * spare vdev is excluded from the processed bytes.
                         */
                        if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
                                vdev_t *tvd = vd->vdev_top;
@@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                                vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
                                uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
 
-                               if (vd->vdev_ops->vdev_op_leaf)
+                               if (vd->vdev_ops->vdev_op_leaf &&
+                                   vd->vdev_ops != &vdev_draid_spare_ops) {
                                        atomic_add_64(rebuilt, psize);
+                               }
                                vs->vs_rebuild_processed += psize;
                        }
 
@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
            vdev_resilver_needed(vd, NULL, NULL));
 }
 
+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+       return (rs->rs_start == rs->rs_end);
+}
+
 /*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t.  This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
  */
 void
 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs)
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
        /*
         * Walk up the vdev tree
         */
        if (vd != vd->vdev_top) {
-               vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+               vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+                   remain_rs);
        } else {
                /*
-                * We've reached the top-level vdev, initialize the
-                * physical range to the logical range and start to
-                * unwind.
+                * We've reached the top-level vdev, initialize the physical
+                * range to the logical range and set an empty remaining
+                * range then start to unwind.
                 */
                physical_rs->rs_start = logical_rs->rs_start;
                physical_rs->rs_end = logical_rs->rs_end;
+
+               remain_rs->rs_start = logical_rs->rs_start;
+               remain_rs->rs_end = logical_rs->rs_start;
+
                return;
        }
 
@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
 
        /*
         * As this recursive function unwinds, translate the logical
-        * range into its physical components by calling the
-        * vdev specific translate function.
+        * range into its physical and any remaining components by calling
+        * the vdev specific translate function.
         */
        range_seg64_t intermediate = { 0 };
-       pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+       pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
 
        physical_rs->rs_start = intermediate.rs_start;
        physical_rs->rs_end = intermediate.rs_end;
 }
 
+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+    vdev_xlate_func_t *func, void *arg)
+{
+       range_seg64_t iter_rs = *logical_rs;
+       range_seg64_t physical_rs;
+       range_seg64_t remain_rs;
+
+       while (!vdev_xlate_is_empty(&iter_rs)) {
+
+               vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+               /*
+                * With raidz and dRAID, it's possible that the logical range
+                * does not live on this leaf vdev.  Only call the provided
+                * function when the translated physical size is non-zero.
+                */
+               if (!vdev_xlate_is_empty(&physical_rs))
+                       func(arg, &physical_rs);
+
+               iter_rs = remain_rs;
+       }
+}
+
 /*
  * Look at the vdev tree and determine whether any devices are currently being
  * replaced.
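The new vdev_xlate_walk() helper repeats vdev_xlate() until the remaining range is empty, invoking the caller's function once per contiguous physical range. A sketch of a caller that sums the physical bytes backing a logical range on a leaf vdev (the callback signature is inferred from the func(arg, &physical_rs) call above):

static void
vdev_xlate_sum_cb(void *arg, range_seg64_t *physical_rs)
{
        uint64_t *sum = arg;

        *sum += physical_rs->rs_end - physical_rs->rs_start;
}

static uint64_t
vdev_xlate_physical_size(vdev_t *vd, const range_seg64_t *logical_rs)
{
        uint64_t sum = 0;

        vdev_xlate_walk(vd, logical_rs, vdev_xlate_sum_cb, &sum);

        return (sum);
}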