OpenZFS 7614, 9064 - zfs device evacuation/removal
index cb86800f4bb374e2efcaf160c31e826b6e0ec58d..0352393dc27654e41f6ecdca5fd7ccf9e58500f7 100644
@@ -73,6 +73,13 @@ unsigned long zfs_per_txg_dirty_frees_percent = 30;
  */
 int zfs_dmu_offset_next_sync = 0;
 
+/*
+ * This can be used for testing, to ensure that certain actions happen
+ * while in the middle of a remap (which might otherwise complete too
+ * quickly).
+ */
+int zfs_object_remap_one_indirect_delay_ticks = 0;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
        { DMU_BSWAP_UINT8,      TRUE,   FALSE,  "unallocated"           },
        { DMU_BSWAP_ZAP,        TRUE,   FALSE,  "object directory"      },
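
Note: the new zfs_object_remap_one_indirect_delay_ticks tunable above is only declared in this hunk. If it is meant to be adjusted at runtime it would also need a module_param entry like the ones at the bottom of this file; the hunk that would export it is not part of this excerpt, so the following is a sketch only:

        module_param(zfs_object_remap_one_indirect_delay_ticks, int, 0644);
        MODULE_PARM_DESC(zfs_object_remap_one_indirect_delay_ticks,
            "Ticks to delay after remapping each L1 indirect (for testing)");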
@@ -847,8 +854,11 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 
                        while (dr != NULL && dr->dr_txg > tx->tx_txg)
                                dr = dr->dr_next;
-                       if (dr != NULL && dr->dr_txg == tx->tx_txg)
+                       if (dr != NULL && dr->dr_txg == tx->tx_txg) {
                                dr->dt.dl.dr_raw = B_TRUE;
+                               dn->dn_objset->os_next_write_raw
+                                   [tx->tx_txg & TXG_MASK] = B_TRUE;
+                       }
                }
 
                dmu_tx_commit(tx);
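
Note: the new os_next_write_raw assignment above uses the usual per-txg slot indexing. A quick worked example, assuming the stock TXG_SIZE of 4 and hence TXG_MASK of 3 (values from txg.h, not from this diff):

        /*
         * A transaction assigned to txg 100 sets slot 100 & 3 == 0 of the
         * TXG_SIZE-wide array; later code handling the same txg indexes
         * the same slot and sees the raw-write flag.
         */
        dn->dn_objset->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;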
@@ -1111,6 +1121,123 @@ dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
        dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
+static int
+dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
+    uint64_t last_removal_txg, uint64_t offset)
+{
+       uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+       int err = 0;
+
+       rw_enter(&dn->dn_struct_rwlock, RW_READER);
+       dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+       ASSERT3P(dbuf, !=, NULL);
+
+       /*
+        * If the block hasn't been written yet, this default will ensure
+        * we don't try to remap it.
+        */
+       uint64_t birth = UINT64_MAX;
+       ASSERT3U(last_removal_txg, !=, UINT64_MAX);
+       if (dbuf->db_blkptr != NULL)
+               birth = dbuf->db_blkptr->blk_birth;
+       rw_exit(&dn->dn_struct_rwlock);
+
+       /*
+        * If this L1 was already written after the last removal, then we've
+        * already tried to remap it.
+        */
+       if (birth <= last_removal_txg &&
+           dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
+           dbuf_can_remap(dbuf)) {
+               dmu_tx_t *tx = dmu_tx_create(os);
+               dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
+               err = dmu_tx_assign(tx, TXG_WAIT);
+               if (err == 0) {
+                       (void) dbuf_dirty(dbuf, tx);
+                       dmu_tx_commit(tx);
+               } else {
+                       dmu_tx_abort(tx);
+               }
+       }
+
+       dbuf_rele(dbuf, FTAG);
+
+       delay(zfs_object_remap_one_indirect_delay_ticks);
+
+       return (err);
+}
+
+/*
+ * Remap all blockpointers in the object, if possible, so that they reference
+ * only concrete vdevs.
+ *
+ * To do this, iterate over the L0 blockpointers and remap any that reference
+ * an indirect vdev. Note that we only examine L0 blockpointers; since we
+ * cannot guarantee that we can remap all blockpointers anyway (due to split
+ * blocks), we do not want to make the code unnecessarily complicated to
+ * catch the unlikely case that there is an L1 block on an indirect vdev that
+ * contains no indirect blockpointers.
+ */
+int
+dmu_object_remap_indirects(objset_t *os, uint64_t object,
+    uint64_t last_removal_txg)
+{
+       uint64_t offset, l1span;
+       int err;
+       dnode_t *dn;
+
+       err = dnode_hold(os, object, FTAG, &dn);
+       if (err != 0) {
+               return (err);
+       }
+
+       if (dn->dn_nlevels <= 1) {
+               if (issig(JUSTLOOKING) && issig(FORREAL)) {
+                       err = SET_ERROR(EINTR);
+               }
+
+               /*
+                * If the dnode has no indirect blocks, we cannot dirty them.
+                * We still want to remap the blkptr(s) in the dnode if
+                * appropriate, so mark it as dirty.
+                */
+               if (err == 0 && dnode_needs_remap(dn)) {
+                       dmu_tx_t *tx = dmu_tx_create(os);
+                       dmu_tx_hold_bonus(tx, dn->dn_object);
+                       if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
+                               dnode_setdirty(dn, tx);
+                               dmu_tx_commit(tx);
+                       } else {
+                               dmu_tx_abort(tx);
+                       }
+               }
+
+               dnode_rele(dn, FTAG);
+               return (err);
+       }
+
+       offset = 0;
+       l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
+           dn->dn_datablkshift);
+       /*
+        * Find the next L1 indirect that is not a hole.
+        */
+       while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
+               if (issig(JUSTLOOKING) && issig(FORREAL)) {
+                       err = SET_ERROR(EINTR);
+                       break;
+               }
+               if ((err = dmu_object_remap_one_indirect(os, dn,
+                   last_removal_txg, offset)) != 0) {
+                       break;
+               }
+               offset += l1span;
+       }
+
+       dnode_rele(dn, FTAG);
+       return (err);
+}
+
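
Note: the l1span stride in dmu_object_remap_indirects above is the amount of file data covered by one L1 indirect block. A worked example with typical values (assumed defaults, not taken from this diff): 128K indirect blocks (dn_indblkshift = 17), 128-byte block pointers (SPA_BLKPTRSHIFT = 7), and a 128K recordsize (dn_datablkshift = 17):

        /* 2^(17 - 7) = 1024 blkptrs per L1, each covering a 2^17 = 128 KiB L0 block */
        uint64_t l1span = 1ULL << (17 - 7 + 17);        /* 2^27 = 128 MiB */

so each pass through the while loop advances offset by 128 MiB of logical file data before asking dnode_next_offset() for the next non-hole L1.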
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
@@ -1539,29 +1666,39 @@ dmu_return_arcbuf(arc_buf_t *buf)
        arc_buf_destroy(buf, FTAG);
 }
 
-void
-dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
-    const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
+int
+dmu_convert_mdn_block_to_raw(objset_t *os, uint64_t firstobj,
+    boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
+    const uint8_t *mac, dmu_tx_t *tx)
 {
-       dmu_object_type_t type;
-       dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
-       uint64_t dsobj = dmu_objset_id(db->db_objset);
+       int ret;
+       dmu_buf_t *handle = NULL;
+       dmu_buf_impl_t *db = NULL;
+       uint64_t offset = firstobj * DNODE_MIN_SIZE;
+       uint64_t dsobj = dmu_objset_id(os);
 
-       ASSERT3P(db->db_buf, !=, NULL);
-       ASSERT3U(dsobj, !=, 0);
+       ret = dmu_buf_hold_by_dnode(DMU_META_DNODE(os), offset, FTAG, &handle,
+           DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
+       if (ret != 0)
+               return (ret);
 
        dmu_buf_will_change_crypt_params(handle, tx);
 
-       DB_DNODE_ENTER(db);
-       type = DB_DNODE(db)->dn_type;
-       DB_DNODE_EXIT(db);
+       db = (dmu_buf_impl_t *)handle;
+       ASSERT3P(db->db_buf, !=, NULL);
+       ASSERT3U(dsobj, !=, 0);
 
        /*
         * This technically violates the assumption the dmu code makes
         * that dnode blocks are only released in syncing context.
         */
        (void) arc_release(db->db_buf, db);
-       arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac);
+       arc_convert_to_raw(db->db_buf, dsobj, byteorder, DMU_OT_DNODE,
+           salt, iv, mac);
+
+       dmu_buf_rele(handle, FTAG);
+
+       return (0);
 }
 
 void
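
Note: in dmu_convert_mdn_block_to_raw above, firstobj is turned into a byte offset within the objset's meta-dnode object, since dnodes are laid out there at DNODE_MIN_SIZE intervals. A small worked example (DNODE_MIN_SIZE is 512 bytes per sys/dnode.h; the object number is illustrative only):

        /* firstobj 8192 maps to byte offset 8192 * 512 = 4 MiB into the meta-dnode */
        uint64_t offset = 8192 * DNODE_MIN_SIZE;

dmu_buf_hold_by_dnode() then returns the dnode block starting at that offset, and that buffer is what gets converted to raw.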
@@ -2104,8 +2241,6 @@ dmu_object_dirty_raw(objset_t *os, uint64_t object, dmu_tx_t *tx)
        return (err);
 }
 
-int zfs_mdcomp_disable = 0;
-
 /*
  * When the "redundant_metadata" property is set to "most", only indirect
  * blocks of this level and higher will have an additional ditto block.
@@ -2135,16 +2270,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
         *       3. all other level 0 blocks
         */
        if (ismd) {
-               if (zfs_mdcomp_disable) {
-                       compress = ZIO_COMPRESS_EMPTY;
-               } else {
-                       /*
-                        * XXX -- we should design a compression algorithm
-                        * that specializes in arrays of bps.
-                        */
-                       compress = zio_compress_select(os->os_spa,
-                           ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
-               }
+               /*
+                * XXX -- we should design a compression algorithm
+                * that specializes in arrays of bps.
+                */
+               compress = zio_compress_select(os->os_spa,
+                   ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
 
                /*
                 * Metadata always gets checksummed.  If the data
@@ -2277,7 +2408,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
         * Check if dnode is dirty
         */
        for (i = 0; i < TXG_SIZE; i++) {
-               if (list_link_active(&dn->dn_dirty_link[i])) {
+               if (multilist_link_active(&dn->dn_dirty_link[i])) {
                        clean = B_FALSE;
                        break;
                }
@@ -2520,9 +2651,6 @@ EXPORT_SYMBOL(dmu_buf_hold);
 EXPORT_SYMBOL(dmu_ot);
 
 /* BEGIN CSTYLED */
-module_param(zfs_mdcomp_disable, int, 0644);
-MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
-
 module_param(zfs_nopwrite_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");