Set aside a metaslab for ZIL blocks
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 7ffe924212da9063f2166c0b4aab5090cac20af0..f305da6f56e7513829cf5534de262d8db1023138 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class.  The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added).  See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations.  Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev.  This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL.  If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1 << spa_slop_shift) due to the embedded slog metaslab.
+ */
+int zfs_embedded_slog_min_ms = 64;
+
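
For context, the three-tier fallback described above is the order the ZIL
block allocator walks. A minimal sketch, simplified from zio_alloc_zil()
(not part of this hunk; arguments abbreviated to the metaslab_alloc()
signature):

	/* 1. dedicated slog devices (spa_log_class) */
	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
	    txg, NULL, flags, &io_alloc_list, NULL, allocator);
	if (error != 0) {
		/* 2. embedded slog metaslabs (spa_embedded_log_class) */
		error = metaslab_alloc(spa, spa_embedded_log_class(spa),
		    size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
		    NULL, allocator);
	}
	if (error != 0) {
		/* 3. any other normal-class metaslab (spa_normal_class) */
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL,
		    allocator);
	}
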
 /* default target for number of metaslabs per top-level vdev */
 int zfs_vdev_default_ms_count = 200;
 
@@ -223,6 +244,22 @@ vdev_getops(const char *type)
        return (ops);
 }
 
+/*
+ * Given a vdev and a metaslab class, find the metaslab group we're
+ * interested in. A vdev may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group. For embedded slogs, vdev_log_mg will be non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+       if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+           vd->vdev_log_mg != NULL)
+               return (vd->vdev_log_mg);
+       else
+               return (vd->vdev_mg);
+}
+
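
Illustrative use of vdev_get_mg() (not part of the patch): on a normal-class
top-level vdev, only the embedded log class resolves to the log group; any
other class falls through to the primary group.

	spa_t *spa = vd->vdev_spa;
	/* Resolves to vd->vdev_log_mg on a normal-class vdev. */
	metaslab_group_t *log_mg =
	    vdev_get_mg(vd, spa_embedded_log_class(spa));
	/* Resolves to vd->vdev_mg. */
	metaslab_group_t *mg = vdev_get_mg(vd, spa_normal_class(spa));
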
 /* ARGSUSED */
 void
 vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
@@ -978,6 +1015,11 @@ vdev_free(vdev_t *vd)
                metaslab_group_destroy(vd->vdev_mg);
                vd->vdev_mg = NULL;
        }
+       if (vd->vdev_log_mg != NULL) {
+               ASSERT0(vd->vdev_ms_count);
+               metaslab_group_destroy(vd->vdev_log_mg);
+               vd->vdev_log_mg = NULL;
+       }
 
        ASSERT0(vd->vdev_stat.vs_space);
        ASSERT0(vd->vdev_stat.vs_dspace);
@@ -1098,14 +1140,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 
        if (tvd->vdev_mg)
                ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+       if (tvd->vdev_log_mg)
+               ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
        tvd->vdev_mg = svd->vdev_mg;
+       tvd->vdev_log_mg = svd->vdev_log_mg;
        tvd->vdev_ms = svd->vdev_ms;
 
        svd->vdev_mg = NULL;
+       svd->vdev_log_mg = NULL;
        svd->vdev_ms = NULL;
 
        if (tvd->vdev_mg != NULL)
                tvd->vdev_mg->mg_vd = tvd;
+       if (tvd->vdev_log_mg != NULL)
+               tvd->vdev_log_mg->mg_vd = tvd;
 
        tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
        svd->vdev_checkpoint_sm = NULL;
@@ -1283,7 +1331,7 @@ vdev_remove_parent(vdev_t *cvd)
        vdev_free(mvd);
 }
 
-static void
+void
 vdev_metaslab_group_create(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
@@ -1317,6 +1365,11 @@ vdev_metaslab_group_create(vdev_t *vd)
                vd->vdev_mg = metaslab_group_create(mc, vd,
                    spa->spa_alloc_count);
 
+               if (!vd->vdev_islog) {
+                       vd->vdev_log_mg = metaslab_group_create(
+                           spa_embedded_log_class(spa), vd, 1);
+               }
+
                /*
                 * The spa ashift min/max only apply for the normal metaslab
                 * class. Class destination is late binding so ashift boundary
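
A design note on the hunk above: the embedded log group is created with a
single allocator, while the primary group gets spa->spa_alloc_count,
presumably because the log group only ever backs one metaslab per vdev, so
per-allocator parallelism would buy nothing there.
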
@@ -1340,8 +1393,6 @@ int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
-       objset_t *mos = spa->spa_meta_objset;
-       uint64_t m;
        uint64_t oldc = vd->vdev_ms_count;
        uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
        metaslab_t **mspp;
@@ -1369,16 +1420,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 
        vd->vdev_ms = mspp;
        vd->vdev_ms_count = newc;
-       for (m = oldc; m < newc; m++) {
-               uint64_t object = 0;
 
+       for (uint64_t m = oldc; m < newc; m++) {
+               uint64_t object = 0;
                /*
                 * vdev_ms_array may be 0 if we are creating the "fake"
                 * metaslabs for an indirect vdev for zdb's leak detection.
                 * See zdb_leak_init().
                 */
                if (txg == 0 && vd->vdev_ms_array != 0) {
-                       error = dmu_read(mos, vd->vdev_ms_array,
+                       error = dmu_read(spa->spa_meta_objset,
+                           vd->vdev_ms_array,
                            m * sizeof (uint64_t), sizeof (uint64_t), &object,
                            DMU_READ_PREFETCH);
                        if (error != 0) {
@@ -1388,17 +1440,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
                        }
                }
 
-#ifndef _KERNEL
-               /*
-                * To accommodate zdb_leak_init() fake indirect
-                * metaslabs, we allocate a metaslab group for
-                * indirect vdevs which normally don't have one.
-                */
-               if (vd->vdev_mg == NULL) {
-                       ASSERT0(vdev_is_concrete(vd));
-                       vdev_metaslab_group_create(vd);
-               }
-#endif
                error = metaslab_init(vd->vdev_mg, m, object, txg,
                    &(vd->vdev_ms[m]));
                if (error != 0) {
@@ -1408,6 +1449,47 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
                }
        }
 
+       /*
+        * Find the emptiest metaslab on the vdev and mark it for use for
+        * embedded slog by moving it from the regular to the log metaslab
+        * group.
+        */
+       if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+           vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+           avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+               uint64_t slog_msid = 0;
+               uint64_t smallest = UINT64_MAX;
+
+               /*
+                * Note, we only search the new metaslabs, because the old
+                * (pre-existing) ones may be active (e.g. have non-empty
+                * range_trees), and we don't move them to the new
+                * metaslab_t.
+                */
+               for (uint64_t m = oldc; m < newc; m++) {
+                       uint64_t alloc =
+                           space_map_allocated(vd->vdev_ms[m]->ms_sm);
+                       if (alloc < smallest) {
+                               slog_msid = m;
+                               smallest = alloc;
+                       }
+               }
+               metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+               /*
+                * The metaslab was marked as dirty at the end of
+                * metaslab_init(). Remove it from the dirty list so that we
+                * can uninitialize and reinitialize it to the new class.
+                */
+               if (txg != 0) {
+                       (void) txg_list_remove_this(&vd->vdev_ms_list,
+                           slog_ms, txg);
+               }
+               uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+               metaslab_fini(slog_ms);
+               VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+                   &vd->vdev_ms[slog_msid]));
+       }
+
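
As a post-condition sketch for the block above (illustration only, not in
the patch): the chosen metaslab's group back-pointer now refers to the
embedded log group rather than the primary one.

	/* Hypothetical check; slog_msid as computed in the block above. */
	ASSERT3P(vd->vdev_ms[slog_msid]->ms_group, ==, vd->vdev_log_mg);
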
        if (txg == 0)
                spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
@@ -1418,6 +1500,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
         */
        if (!expanding && !vd->vdev_removing) {
                metaslab_group_activate(vd->vdev_mg);
+               if (vd->vdev_log_mg != NULL)
+                       metaslab_group_activate(vd->vdev_log_mg);
        }
 
        if (txg == 0)
@@ -1453,7 +1537,12 @@ vdev_metaslab_fini(vdev_t *vd)
 
        if (vd->vdev_ms != NULL) {
                metaslab_group_t *mg = vd->vdev_mg;
+
                metaslab_group_passivate(mg);
+               if (vd->vdev_log_mg != NULL) {
+                       ASSERT(!vd->vdev_islog);
+                       metaslab_group_passivate(vd->vdev_log_mg);
+               }
 
                uint64_t count = vd->vdev_ms_count;
                for (uint64_t m = 0; m < count; m++) {
@@ -1463,11 +1552,13 @@ vdev_metaslab_fini(vdev_t *vd)
                }
                vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
                vd->vdev_ms = NULL;
-
                vd->vdev_ms_count = 0;
 
-               for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+               for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
                        ASSERT0(mg->mg_histogram[i]);
+                       if (vd->vdev_log_mg != NULL)
+                               ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+               }
        }
        ASSERT0(vd->vdev_ms_count);
        ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
@@ -3531,8 +3622,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
            != NULL)
                metaslab_sync_done(msp, txg);
 
-       if (reassess)
+       if (reassess) {
                metaslab_sync_reassess(vd->vdev_mg);
+               if (vd->vdev_log_mg != NULL)
+                       metaslab_sync_reassess(vd->vdev_log_mg);
+       }
 }
 
 void
@@ -3856,6 +3950,7 @@ top:
                        /*
                         * Prevent any future allocations.
                         */
+                       ASSERT3P(tvd->vdev_log_mg, ==, NULL);
                        metaslab_group_passivate(mg);
                        (void) spa_vdev_state_exit(spa, vd, 0);
 
@@ -4256,6 +4351,12 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                 */
                if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
                    vdev_is_concrete(vd)) {
+                       /*
+                        * The vdev fragmentation rating doesn't take into
+                        * account the embedded slog metaslab (vdev_log_mg).
+                        * Since it's only one metaslab, it would have a tiny
+                        * impact on the overall fragmentation.
+                        */
                        vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
                            vd->vdev_mg->mg_fragmentation : 0;
                }
@@ -5234,6 +5335,9 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
        "Disable cache flushes");
 
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+       "Minimum number of metaslabs required to dedicate one for log blocks");
+
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
        param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
        "Minimum ashift used when creating new top-level vdevs");