*/
int zfs_dmu_offset_next_sync = 0;
+/*
+ * This can be used for testing, to ensure that certain actions happen
+ * while in the middle of a remap (which might otherwise complete too
+ * quickly).
+ */
+int zfs_object_remap_one_indirect_delay_ticks = 0;
+
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset next clones"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group used" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group quota" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group/project used" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group/project quota"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "snapshot refcount tags"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
}
if (nblks != 0) {
- int i;
-
blkid = dbuf_whichblock(dn, level, offset);
- for (i = 0; i < nblks; i++)
+ for (int i = 0; i < nblks; i++)
dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
/* bytes of data covered by a level-1 indirect block */
uint64_t iblkrange =
dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
- uint64_t blks;
ASSERT3U(minimum, <=, *start);
}
ASSERT(ISP2(iblkrange));
- for (blks = 0; *start > minimum && blks < maxblks; blks++) {
+ for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
int err;
/*
int err;
uint64_t dirty_frees_threshold;
dsl_pool_t *dp = dmu_objset_pool(os);
- int t;
if (dn == NULL)
return (SET_ERROR(EINVAL));
chunk_len = chunk_end - chunk_begin;
mutex_enter(&dp->dp_lock);
- for (t = 0; t < TXG_SIZE; t++) {
+ for (int t = 0; t < TXG_SIZE; t++) {
long_free_dirty_all_txgs +=
dp->dp_long_free_dirty_pertxg[t];
}
while (dr != NULL && dr->dr_txg > tx->tx_txg)
dr = dr->dr_next;
- if (dr != NULL && dr->dr_txg == tx->tx_txg)
+ if (dr != NULL && dr->dr_txg == tx->tx_txg) {
dr->dt.dl.dr_raw = B_TRUE;
+ dn->dn_objset->os_next_write_raw
+ [tx->tx_txg & TXG_MASK] = B_TRUE;
+ }
}
dmu_tx_commit(tx);
dmu_tx_mark_netfree(tx);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err == 0) {
- err = dmu_object_free(os, object, tx);
- if (err == 0 && raw)
- VERIFY0(dmu_object_dirty_raw(os, object, tx));
+ if (raw)
+ err = dmu_object_dirty_raw(os, object, tx);
+ if (err == 0)
+ err = dmu_object_free(os, object, tx);
dmu_tx_commit(tx);
} else {
if (err)
return (err);
ASSERT(offset < UINT64_MAX);
- ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+ ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
dnode_free_range(dn, offset, size, tx);
dnode_rele(dn, FTAG);
return (0);
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
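+/*
+ * Helper for dmu_object_remap_indirects(): dirty the L1 indirect block
+ * covering 'offset' if it has not been rewritten since the last device
+ * removal and dbuf_can_remap() indicates remapping is still possible,
+ * so that its L0 block pointers are remapped when it is written out.
+ */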
+static int
+dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
+ uint64_t last_removal_txg, uint64_t offset)
+{
+ uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+ int err = 0;
+
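+ /*
+ * dn_struct_rwlock must be held to look up the L1 dbuf and to
+ * safely examine its block pointer.
+ */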
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ ASSERT3P(dbuf, !=, NULL);
+
+ /*
+ * If the block hasn't been written yet, this default will ensure
+ * we don't try to remap it.
+ */
+ uint64_t birth = UINT64_MAX;
+ ASSERT3U(last_removal_txg, !=, UINT64_MAX);
+ if (dbuf->db_blkptr != NULL)
+ birth = dbuf->db_blkptr->blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /*
+ * If this L1 was already written after the last removal, then we've
+ * already tried to remap it.
+ */
+ if (birth <= last_removal_txg &&
+ dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
+ dbuf_can_remap(dbuf)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ (void) dbuf_dirty(dbuf, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ }
+
+ dbuf_rele(dbuf, FTAG);
+
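+ /* Test-only throttle; see zfs_object_remap_one_indirect_delay_ticks. */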
+ delay(zfs_object_remap_one_indirect_delay_ticks);
+
+ return (err);
+}
+
+/*
+ * Remap all blockpointers in the object, if possible, so that they reference
+ * only concrete vdevs.
+ *
+ * To do this, iterate over the L0 blockpointers and remap any that reference
+ * an indirect vdev. Note that we only examine L0 blockpointers; since we
+ * cannot guarantee that we can remap all blockpointers anyway (due to split
+ * blocks), we do not want to make the code unnecessarily complicated to
+ * catch the unlikely case that there is an L1 block on an indirect vdev that
+ * contains no indirect blockpointers.
+ */
+int
+dmu_object_remap_indirects(objset_t *os, uint64_t object,
+ uint64_t last_removal_txg)
+{
+ uint64_t offset, l1span;
+ int err;
+ dnode_t *dn;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0) {
+ return (err);
+ }
+
+ if (dn->dn_nlevels <= 1) {
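+ /* Bail out early if the caller has been interrupted. */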
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ }
+
+ /*
+ * If the dnode has no indirect blocks, we cannot dirty them.
+ * We still want to remap the blkptr(s) in the dnode if
+ * appropriate, so mark the dnode dirty.
+ */
+ if (err == 0 && dnode_needs_remap(dn)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, dn->dn_object);
+ if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
+ dnode_setdirty(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ return (err);
+ }
+
+ offset = 0;
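+ /* Bytes of data covered by one level-1 indirect block. */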
+ l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
+ dn->dn_datablkshift);
+ /*
+ * Find the next L1 indirect that is not a hole.
+ */
+ while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ break;
+ }
+ if ((err = dmu_object_remap_one_indirect(os, dn,
+ last_removal_txg, offset)) != 0) {
+ break;
+ }
+ offset += l1span;
+ }
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx)
arc_buf_destroy(buf, FTAG);
}
-void
-dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
- const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
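+/*
+ * Convert the metadnode block containing object 'firstobj' into its raw
+ * (encrypted) form in the ARC, using the provided byteorder, salt, IV,
+ * and MAC.
+ */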
+int
+dmu_convert_mdn_block_to_raw(objset_t *os, uint64_t firstobj,
+ boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
+ const uint8_t *mac, dmu_tx_t *tx)
{
- dmu_object_type_t type;
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
- uint64_t dsobj = dmu_objset_id(db->db_objset);
+ int ret;
+ dmu_buf_t *handle = NULL;
+ dmu_buf_impl_t *db = NULL;
+ uint64_t offset = firstobj * DNODE_MIN_SIZE;
+ uint64_t dsobj = dmu_objset_id(os);
- ASSERT3P(db->db_buf, !=, NULL);
- ASSERT3U(dsobj, !=, 0);
+ ret = dmu_buf_hold_by_dnode(DMU_META_DNODE(os), offset, FTAG, &handle,
+ DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
+ if (ret != 0)
+ return (ret);
dmu_buf_will_change_crypt_params(handle, tx);
- DB_DNODE_ENTER(db);
- type = DB_DNODE(db)->dn_type;
- DB_DNODE_EXIT(db);
+ db = (dmu_buf_impl_t *)handle;
+ ASSERT3P(db->db_buf, !=, NULL);
+ ASSERT3U(dsobj, !=, 0);
/*
* This technically violates the assumption the dmu code makes
* that dnode blocks are only released in syncing context.
*/
(void) arc_release(db->db_buf, db);
- arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac);
+ arc_convert_to_raw(db->db_buf, dsobj, byteorder, DMU_OT_DNODE,
+ salt, iv, mac);
+
+ dmu_buf_rele(handle, FTAG);
+
+ return (0);
}
void
return (SET_ERROR(EIO));
}
+ /*
+ * In order to prevent the zgd's lwb from being freed prior to
+ * dmu_sync_late_arrival_done() being called, we have to ensure
+ * the lwb's "max txg" takes this tx's txg into account.
+ */
+ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
+
dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
dsa->dsa_dr = NULL;
dsa->dsa_done = done;
return (err);
}
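+/*
+ * Update the in-core maximum block id for this object, via
+ * dnode_new_blkid() under dn_struct_rwlock.
+ */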
+int
+dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_new_blkid(dn, maxblkid, tx, B_FALSE);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
dmu_tx_t *tx)
return (err);
}
-int zfs_mdcomp_disable = 0;
-
/*
* When the "redundant_metadata" property is set to "most", only indirect
* blocks of this level and higher will have an additional ditto block.
* 3. all other level 0 blocks
*/
if (ismd) {
- if (zfs_mdcomp_disable) {
- compress = ZIO_COMPRESS_EMPTY;
- } else {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- compress = zio_compress_select(os->os_spa,
- ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
- }
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
/*
* Metadata always gets checksummed. If the data
dedup = B_FALSE;
}
- if (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)
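+ /*
+ * Only level-0 dnode and objset blocks are written uncompressed;
+ * their indirect blocks are compressed like other metadata.
+ */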
+ if (level <= 0 &&
+ (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) {
compress = ZIO_COMPRESS_EMPTY;
+ }
}
zp->zp_compress = compress;
/*
* Check if dnode is dirty
*/
- if (dn->dn_dirtyctx != DN_UNDIRTIED) {
- for (i = 0; i < TXG_SIZE; i++) {
- if (!list_is_empty(&dn->dn_dirty_records[i])) {
- clean = B_FALSE;
- break;
- }
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (multilist_link_active(&dn->dn_dirty_link[i])) {
+ clean = B_FALSE;
+ break;
}
}
__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
dnode_phys_t *dnp = dn->dn_phys;
- int i;
doi->doi_data_block_size = dn->dn_datablksz;
doi->doi_metadata_block_size = dn->dn_indblkshift ?
doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
doi->doi_fill_count = 0;
- for (i = 0; i < dnp->dn_nblkptr; i++)
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
}
EXPORT_SYMBOL(dmu_object_dnsize_from_db);
EXPORT_SYMBOL(dmu_object_set_nlevels);
EXPORT_SYMBOL(dmu_object_set_blocksize);
+EXPORT_SYMBOL(dmu_object_set_maxblkid);
EXPORT_SYMBOL(dmu_object_set_checksum);
EXPORT_SYMBOL(dmu_object_set_compress);
EXPORT_SYMBOL(dmu_write_policy);
EXPORT_SYMBOL(dmu_ot);
/* BEGIN CSTYLED */
-module_param(zfs_mdcomp_disable, int, 0644);
-MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
-
module_param(zfs_nopwrite_enabled, int, 0644);
MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");