RAID-Z expansion feature
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index afb01c0ef7fd50f83b19d397426037b1eda97654..c10c78ebf6db8c7278ab328a59bfe12c65393850 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -58,6 +58,7 @@
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 #include "zfs_prop.h"
@@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias)
  * all children.  This is what's used by anything other than RAID-Z.
  */
 uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
+vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
        uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
        uint64_t csize;
 
        for (int c = 0; c < vd->vdev_children; c++) {
-               csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+               csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
                asize = MAX(asize, csize);
        }
 
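For vdevs other than RAID-Z, asize is therefore just psize rounded up to the
top-level vdev's ashift, maximized across children.  A worked sketch of that
rounding with illustrative numbers (the stand-alone helper and values are
hypothetical; P2ROUNDUP mirrors the power-of-two round-up macro from the ZFS
headers):

/* Illustrative only: 5000 bytes on a 4 KiB-sector (ashift=12) vdev. */
#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)

static uint64_t
example_default_asize(void)
{
	uint64_t psize = 5000;		/* compressed block size in bytes */
	uint64_t ashift = 12;		/* 4 KiB sectors */

	/* (5000 - 1) | 4095 = 8191, + 1 = 8192 */
	return (P2ROUNDUP(psize, 1ULL << ashift));
}
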
@@ -930,6 +931,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                    &vd->vdev_removing);
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
                    &vd->vdev_top_zap);
+               vd->vdev_rz_expanding = nvlist_exists(nv,
+                   ZPOOL_CONFIG_RAIDZ_EXPANDING);
        } else {
                ASSERT0(vd->vdev_top_zap);
        }
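The expansion state is persisted in the top-level vdev's config nvlist as a
bare boolean key, so nvlist_exists() is all that is needed on the read side.
A minimal sketch of both sides using libnvpair (the wrapper function and the
header locations are assumptions; only ZPOOL_CONFIG_RAIDZ_EXPANDING comes
from this change):

#include <sys/nvpair.h>
#include <sys/fs/zfs.h>		/* ZPOOL_CONFIG_RAIDZ_EXPANDING */

static boolean_t
example_rz_expanding_flag(void)
{
	nvlist_t *nv = fnvlist_alloc();
	boolean_t expanding;

	/* Writer side: mark the top-level vdev as undergoing expansion. */
	fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);

	/* Reader side (as in vdev_alloc() above): key presence is the flag. */
	expanding = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);

	fnvlist_free(nv);
	return (expanding);
}
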
@@ -1692,6 +1695,8 @@ vdev_probe_done(zio_t *zio)
 
                vd->vdev_cant_read |= !vps->vps_readable;
                vd->vdev_cant_write |= !vps->vps_writeable;
+               vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
+                   vd->vdev_cant_read, vd->vdev_cant_write);
 
                if (vdev_readable(vd) &&
                    (vdev_writeable(vd) || !spa_writeable(spa))) {
@@ -1913,17 +1918,20 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
 }
 
 /*
- * Compute the raidz-deflation ratio.  Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
+ * Compute the raidz-deflation ratio.  Note that we hard-code 128k (1 << 17)
+ * because it is the "typical" blocksize.  Even though SPA_MAXBLOCKSIZE
+ * changed, this algorithm cannot change; otherwise it would inconsistently
+ * account for existing BPs.  We also hard-code txg 0 for the same reason,
+ * since expanded RAIDZ vdevs can use a different asize for different birth
+ * txgs.
  */
 static void
 vdev_set_deflate_ratio(vdev_t *vd)
 {
        if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
                vd->vdev_deflate_ratio = (1 << 17) /
-                   (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+                   (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
+                   SPA_MINBLOCKSHIFT);
        }
 }
 
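Concretely, the deflate ratio scales a block's raw allocated size down to the
referenced size it should account as, derived from a nominal 128 KiB block.
A worked example with an assumed allocation size (176 KiB is illustrative,
roughly what a narrow RAID-Z1 with 4 KiB sectors allocates for a 128 KiB
block; the helper is hypothetical and assumes the usual ZFS headers):

/* Illustrative arithmetic only; the asize value is assumed, not computed. */
static uint64_t
example_deflate_ratio(void)
{
	uint64_t psize = 1 << 17;	/* 128 KiB nominal block */
	uint64_t asize = 176 * 1024;	/* assumed allocation for that block */

	/* 131072 / (180224 >> 9) = 131072 / 352 = 372 (integer division) */
	return (psize / (asize >> SPA_MINBLOCKSHIFT));
}

For comparison, a plain disk where asize equals psize gives 131072 / 256 =
512, i.e. no deflation.  Because this ratio is folded into the accounting of
every existing bp, both the 128 KiB constant and the txg-0 geometry must stay
fixed even after an expansion changes the vdev's current asize function.
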
@@ -3228,32 +3236,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
 
                if (txg != 0)
                        vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
-               return;
+       } else {
+               mutex_enter(&vd->vdev_dtl_lock);
+               for (int t = 0; t < DTL_TYPES; t++) {
+                       /* account for child's outage in parent's missing map */
+                       int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
+                       if (t == DTL_SCRUB) {
+                               /* leaf vdevs only */
+                               continue;
+                       }
+                       if (t == DTL_PARTIAL) {
+                               /* i.e. non-zero */
+                               minref = 1;
+                       } else if (vdev_get_nparity(vd) != 0) {
+                               /* RAIDZ, DRAID */
+                               minref = vdev_get_nparity(vd) + 1;
+                       } else {
+                               /* any kind of mirror */
+                               minref = vd->vdev_children;
+                       }
+                       space_reftree_create(&reftree);
+                       for (int c = 0; c < vd->vdev_children; c++) {
+                               vdev_t *cvd = vd->vdev_child[c];
+                               mutex_enter(&cvd->vdev_dtl_lock);
+                               space_reftree_add_map(&reftree,
+                                   cvd->vdev_dtl[s], 1);
+                               mutex_exit(&cvd->vdev_dtl_lock);
+                       }
+                       space_reftree_generate_map(&reftree,
+                           vd->vdev_dtl[t], minref);
+                       space_reftree_destroy(&reftree);
+               }
+               mutex_exit(&vd->vdev_dtl_lock);
        }
 
-       mutex_enter(&vd->vdev_dtl_lock);
-       for (int t = 0; t < DTL_TYPES; t++) {
-               /* account for child's outage in parent's missing map */
-               int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
-               if (t == DTL_SCRUB)
-                       continue;                       /* leaf vdevs only */
-               if (t == DTL_PARTIAL)
-                       minref = 1;                     /* i.e. non-zero */
-               else if (vdev_get_nparity(vd) != 0)
-                       minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
-               else
-                       minref = vd->vdev_children;     /* any kind of mirror */
-               space_reftree_create(&reftree);
-               for (int c = 0; c < vd->vdev_children; c++) {
-                       vdev_t *cvd = vd->vdev_child[c];
-                       mutex_enter(&cvd->vdev_dtl_lock);
-                       space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
-                       mutex_exit(&cvd->vdev_dtl_lock);
-               }
-               space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
-               space_reftree_destroy(&reftree);
+       if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
+               raidz_dtl_reassessed(vd);
        }
-       mutex_exit(&vd->vdev_dtl_lock);
 }
 
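The minref values above encode how many children must be missing a range
before the parent itself treats it as missing: one child for DTL_PARTIAL,
parity + 1 for RAID-Z and dRAID, and every child for a mirror.  Restated as a
stand-alone helper (hypothetical, assuming the usual vdev headers for the
DTL_* enum):

/*
 * Sketch only: the reference-count threshold used when folding child
 * DTLs into the parent, mirroring the logic in vdev_dtl_reassess().
 */
static uint64_t
example_dtl_minref(int t, uint64_t nparity, uint64_t nchildren)
{
	if (t == DTL_PARTIAL)
		return (1);		/* any one child's outage counts */
	if (nparity != 0)
		return (nparity + 1);	/* RAID-Z/dRAID tolerate nparity losses */
	return (nchildren);		/* mirror: every copy must be missing */
}

For a six-wide raidz2 this gives minref = 3, so a range reaches the parent's
DTL_MISSING map only when at least three children have it in their outage
maps; a three-way mirror likewise needs all three copies gone.
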
 /*
@@ -3628,6 +3647,12 @@ vdev_load(vdev_t *vd)
 
        vdev_set_deflate_ratio(vd);
 
+       if (vd->vdev_ops == &vdev_raidz_ops) {
+               error = vdev_raidz_load(vd);
+               if (error != 0)
+                       return (error);
+       }
+
        /*
         * On spa_load path, grab the allocation bias from our zap
         */
@@ -4005,10 +4030,22 @@ vdev_sync(vdev_t *vd, uint64_t txg)
        dmu_tx_commit(tx);
 }
 
+/*
+ * Return the amount of space that should be (or was) allocated for the given
+ * psize (compressed block size) in the given TXG. Note that for expanded
+ * RAIDZ vdevs, the size allocated for older BPs may be larger. See
+ * vdev_raidz_asize().
+ */
+uint64_t
+vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+       return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+}
+
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
-       return (vd->vdev_ops->vdev_op_asize(vd, psize));
+       return (vdev_psize_to_asize_txg(vd, psize, 0));
 }
 
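Callers accounting for an existing block can pass its physical birth txg so
an expanded RAID-Z vdev can report the (possibly larger) size that was
actually allocated for that block; new allocations and legacy callers go
through the txg-0 wrapper.  A hedged sketch (the helper is hypothetical; vd
and bp are assumed valid, and BP_GET_PSIZE()/BP_PHYSICAL_BIRTH() are the
usual block-pointer accessors):

/*
 * Sketch only: a block written before an expansion (narrower stripe) can
 * occupy more space than a same-sized block written after it, so ask for
 * the size as of the block's physical birth txg.
 */
static uint64_t
example_bp_asize(vdev_t *vd, const blkptr_t *bp)
{
	return (vdev_psize_to_asize_txg(vd, BP_GET_PSIZE(bp),
	    BP_PHYSICAL_BIRTH(bp)));
}
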
 /*
@@ -4174,9 +4211,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
                return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
-       if (!vd->vdev_ops->vdev_op_leaf)
-               return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
-
        wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
        oldstate = vd->vdev_state;
 
@@ -5457,7 +5491,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
 
        vdev_set_deflate_ratio(vd);
 
-       if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+       if ((vd->vdev_spa->spa_raidz_expand == NULL ||
+           vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
+           (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
            vdev_is_concrete(vd)) {
                vdev_metaslab_group_create(vd);
                VERIFY(vdev_metaslab_init(vd, txg) == 0);
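The extra condition keeps vdev_expand() from adding metaslabs on the one
top-level vdev whose capacity is still being reflowed.  The guard, restated
as a predicate for readability (hypothetical helper; the fields are the ones
used in the hunk above):

/* Sketch only: is this top-level vdev the one currently being expanded? */
static boolean_t
example_vdev_is_reflowing(vdev_t *vd)
{
	return (vd->vdev_spa->spa_raidz_expand != NULL &&
	    vd->vdev_spa->spa_raidz_expand->vre_vdev_id == vd->vdev_id);
}
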
@@ -6209,6 +6245,14 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
                                vdev_prop_add_list(outnvl, propname, NULL,
                                    vd->vdev_removing, ZPROP_SRC_NONE);
                                continue;
+                       case VDEV_PROP_RAIDZ_EXPANDING:
+                               /* Only expose this for raidz */
+                               if (vd->vdev_ops == &vdev_raidz_ops) {
+                                       vdev_prop_add_list(outnvl, propname,
+                                           NULL, vd->vdev_rz_expanding,
+                                           ZPROP_SRC_NONE);
+                               }
+                               continue;
                        /* Numeric Properties */
                        case VDEV_PROP_ALLOCATING:
                                /* Leaf vdevs cannot have this property */