Set aside a metaslab for ZIL blocks
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 7ffe924212da9063f2166c0b4aab5090cac20af0..f305da6f56e7513829cf5534de262d8db1023138 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class.  The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added).  See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations.  Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev.  This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL.  If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1 << spa_slop_shift) due to the embedded slog metaslab.
+ */
+int zfs_embedded_slog_min_ms = 64;
+
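
For context, the three-tier fallback described above is the order the ZIL
block allocator walks. A minimal sketch, simplified from zio_alloc_zil()
(not part of this hunk; arguments abbreviated to the metaslab_alloc()
signature):

	/* 1. dedicated slog devices (spa_log_class) */
	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
	    txg, NULL, flags, &io_alloc_list, NULL, allocator);
	if (error != 0) {
		/* 2. embedded slog metaslabs (spa_embedded_log_class) */
		error = metaslab_alloc(spa, spa_embedded_log_class(spa),
		    size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
		    NULL, allocator);
	}
	if (error != 0) {
		/* 3. any other normal-class metaslab (spa_normal_class) */
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL,
		    allocator);
	}
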
 /* default target for number of metaslabs per top-level vdev */
 int zfs_vdev_default_ms_count = 200;
 
@@ -223,6 +244,22 @@ vdev_getops(const char *type)
        return (ops);
 }
 
+/*
+ * Given a vdev and a metaslab class, find the metaslab group we're
+ * interested in. A vdev may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group. For embedded slogs, vdev_log_mg will be non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+       if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+           vd->vdev_log_mg != NULL)
+               return (vd->vdev_log_mg);
+       else
+               return (vd->vdev_mg);
+}
+
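
Illustrative use of vdev_get_mg() (not part of the patch): on a normal-class
top-level vdev, only the embedded log class resolves to the log group; any
other class falls through to the primary group.

	spa_t *spa = vd->vdev_spa;
	/* Resolves to vd->vdev_log_mg on a normal-class vdev. */
	metaslab_group_t *log_mg =
	    vdev_get_mg(vd, spa_embedded_log_class(spa));
	/* Resolves to vd->vdev_mg. */
	metaslab_group_t *mg = vdev_get_mg(vd, spa_normal_class(spa));
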
 /* ARGSUSED */
 void
 vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
@@ -978,6 +1015,11 @@ vdev_free(vdev_t *vd)
                metaslab_group_destroy(vd->vdev_mg);
                vd->vdev_mg = NULL;
        }
+       if (vd->vdev_log_mg != NULL) {
+               ASSERT0(vd->vdev_ms_count);
+               metaslab_group_destroy(vd->vdev_log_mg);
+               vd->vdev_log_mg = NULL;
+       }
 
        ASSERT0(vd->vdev_stat.vs_space);
        ASSERT0(vd->vdev_stat.vs_dspace);
@@ -1098,14 +1140,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 
        if (tvd->vdev_mg)
                ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+       if (tvd->vdev_log_mg)
+               ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
        tvd->vdev_mg = svd->vdev_mg;
+       tvd->vdev_log_mg = svd->vdev_log_mg;
        tvd->vdev_ms = svd->vdev_ms;
 
        svd->vdev_mg = NULL;
+       svd->vdev_log_mg = NULL;
        svd->vdev_ms = NULL;
 
        if (tvd->vdev_mg != NULL)
                tvd->vdev_mg->mg_vd = tvd;
+       if (tvd->vdev_log_mg != NULL)
+               tvd->vdev_log_mg->mg_vd = tvd;
 
        tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
        svd->vdev_checkpoint_sm = NULL;
@@ -1283,7 +1331,7 @@ vdev_remove_parent(vdev_t *cvd)
        vdev_free(mvd);
 }
 
-static void
+void
 vdev_metaslab_group_create(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
@@ -1317,6 +1365,11 @@ vdev_metaslab_group_create(vdev_t *vd)
                vd->vdev_mg = metaslab_group_create(mc, vd,
                    spa->spa_alloc_count);
 
+               if (!vd->vdev_islog) {
+                       vd->vdev_log_mg = metaslab_group_create(
+                           spa_embedded_log_class(spa), vd, 1);
+               }
+
                /*
                 * The spa ashift min/max only apply for the normal metaslab
                 * class. Class destination is late binding so ashift boundary
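
A design note on the hunk above: the embedded log group is created with a
single allocator, while the primary group gets spa->spa_alloc_count,
presumably because the log group only ever backs one metaslab per vdev, so
per-allocator parallelism would buy nothing there.
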
@@ -1340,8 +1393,6 @@ int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
-       objset_t *mos = spa->spa_meta_objset;
-       uint64_t m;
        uint64_t oldc = vd->vdev_ms_count;
        uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
        metaslab_t **mspp;
@@ -1369,16 +1420,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 
        vd->vdev_ms = mspp;
        vd->vdev_ms_count = newc;
-       for (m = oldc; m < newc; m++) {
-               uint64_t object = 0;
 
+       for (uint64_t m = oldc; m < newc; m++) {
+               uint64_t object = 0;
                /*
                 * vdev_ms_array may be 0 if we are creating the "fake"
                 * metaslabs for an indirect vdev for zdb's leak detection.
                 * See zdb_leak_init().
                 */
                if (txg == 0 && vd->vdev_ms_array != 0) {
-                       error = dmu_read(mos, vd->vdev_ms_array,
+                       error = dmu_read(spa->spa_meta_objset,
+                           vd->vdev_ms_array,
                            m * sizeof (uint64_t), sizeof (uint64_t), &object,
                            DMU_READ_PREFETCH);
                        if (error != 0) {
@@ -1388,17 +1440,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
                        }
                }
 
-#ifndef _KERNEL
-               /*
-                * To accommodate zdb_leak_init() fake indirect
-                * metaslabs, we allocate a metaslab group for
-                * indirect vdevs which normally don't have one.
-                */
-               if (vd->vdev_mg == NULL) {
-                       ASSERT0(vdev_is_concrete(vd));
-                       vdev_metaslab_group_create(vd);
-               }
-#endif
                error = metaslab_init(vd->vdev_mg, m, object, txg,
                    &(vd->vdev_ms[m]));
                if (error != 0) {
@@ -1408,6 +1449,47 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
                }
        }
 
+       /*
+        * Find the emptiest metaslab on the vdev and mark it for use for
+        * embedded slog by moving it from the regular to the log metaslab
+        * group.
+        */
+       if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+           vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+           avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+               uint64_t slog_msid = 0;
+               uint64_t smallest = UINT64_MAX;
+
+               /*
+                * Note, we only search the new metaslabs, because the old
+                * (pre-existing) ones may be active (e.g. have non-empty
+                * range_trees), and we don't move them to the new
+                * metaslab_t.
+                */
+               for (uint64_t m = oldc; m < newc; m++) {
+                       uint64_t alloc =
+                           space_map_allocated(vd->vdev_ms[m]->ms_sm);
+                       if (alloc < smallest) {
+                               slog_msid = m;
+                               smallest = alloc;
+                       }
+               }
+               metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+               /*
+                * The metaslab was marked as dirty at the end of
+                * metaslab_init(). Remove it from the dirty list so that we
+                * can uninitialize and reinitialize it to the new class.
+                */
+               if (txg != 0) {
+                       (void) txg_list_remove_this(&vd->vdev_ms_list,
+                           slog_ms, txg);
+               }
+               uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+               metaslab_fini(slog_ms);
+               VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+                   &vd->vdev_ms[slog_msid]));
+       }
+
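
As a post-condition sketch for the block above (illustration only, not in
the patch): the chosen metaslab's group back-pointer now refers to the
embedded log group rather than the primary one.

	/* Hypothetical check; slog_msid as computed in the block above. */
	ASSERT3P(vd->vdev_ms[slog_msid]->ms_group, ==, vd->vdev_log_mg);
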
        if (txg == 0)
                spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
@@ -1418,6 +1500,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
         */
        if (!expanding && !vd->vdev_removing) {
                metaslab_group_activate(vd->vdev_mg);
+               if (vd->vdev_log_mg != NULL)
+                       metaslab_group_activate(vd->vdev_log_mg);
        }
 
        if (txg == 0)
@@ -1453,7 +1537,12 @@ vdev_metaslab_fini(vdev_t *vd)
 
        if (vd->vdev_ms != NULL) {
                metaslab_group_t *mg = vd->vdev_mg;
+
                metaslab_group_passivate(mg);
+               if (vd->vdev_log_mg != NULL) {
+                       ASSERT(!vd->vdev_islog);
+                       metaslab_group_passivate(vd->vdev_log_mg);
+               }
 
                uint64_t count = vd->vdev_ms_count;
                for (uint64_t m = 0; m < count; m++) {
@@ -1463,11 +1552,13 @@ vdev_metaslab_fini(vdev_t *vd)
                }
                vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
                vd->vdev_ms = NULL;
-
                vd->vdev_ms_count = 0;
 
-               for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+               for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
                        ASSERT0(mg->mg_histogram[i]);
+                       if (vd->vdev_log_mg != NULL)
+                               ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+               }
        }
        ASSERT0(vd->vdev_ms_count);
        ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
@@ -3531,8 +3622,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
            != NULL)
                metaslab_sync_done(msp, txg);
 
-       if (reassess)
+       if (reassess) {
                metaslab_sync_reassess(vd->vdev_mg);
+               if (vd->vdev_log_mg != NULL)
+                       metaslab_sync_reassess(vd->vdev_log_mg);
+       }
 }
 
 void
@@ -3856,6 +3950,7 @@ top:
                        /*
                         * Prevent any future allocations.
                         */
+                       ASSERT3P(tvd->vdev_log_mg, ==, NULL);
                        metaslab_group_passivate(mg);
                        (void) spa_vdev_state_exit(spa, vd, 0);
 
@@ -4256,6 +4351,12 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                 */
                if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
                    vdev_is_concrete(vd)) {
+                       /*
+                        * The vdev fragmentation rating doesn't take into
+                        * account the embedded slog metaslab (vdev_log_mg).
+                        * Since it's only one metaslab, it would have a tiny
+                        * impact on the overall fragmentation.
+                        */
                        vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
                            vd->vdev_mg->mg_fragmentation : 0;
                }
@@ -5234,6 +5335,9 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
        "Disable cache flushes");
 
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+       "Minimum number of metaslabs required to dedicate one for log blocks");
+
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
        param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
        "Minimum ashift used when creating new top-level vdevs");